forked from bcbio/bcbio-nextgen
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnormalize.py
115 lines (94 loc) · 5.06 KB
/
normalize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""Normalization of VCF files for GEMINI and ensemble calling.
1. Split multi-allelic variants into single sample records.
Downstream tools like GEMINI don't handle multi-allelic variants.
`normalize` splits these into multiple single allele variants.
More generally, multi-allelic variants are tricky to represent in
comparisons and storage. There are useful discussions on-going in the
GA4GH about allele/count based approaches to handling this more
generally:
https://github.com/ga4gh/schemas/issues/169
There are multiple approaches to do the decomposition:
- vt decompose: https://github.com/atks/vt
- vcfbreakmulti: https://github.com/ekg/vcflib
- bcftools norm -m '-any': https://samtools.github.io/bcftools/bcftools.html
- vcf_parser --split: https://github.com/moonso/vcf_parser https://github.com/moonso/genmod
vt handles the most cases cleanly and correctly reconstructs PLs for
each genotype, removing other FORMAT items which are not changed so
the resulting VCF is still valid.
2. Decompose biallelic block substitutions
As part of normalization, decompose MNPs into SNPs, e.g. AG>CT -> A>C and G>T
There are a few approaches:
- vcfallelicprimitives -t DECOMPOSED --keep-geno: https://github.com/vcflib/vcflib#vcfallelicprimitives
- vt decompose_blocksub: https://genome.sph.umich.edu/wiki/Vt#Decompose_biallelic_block_substitutions
with `-a`, align/aggressive mode is turned on, which runs 3-4 longer but resolves
additional clumped SNPs
We are using `vcfallelicprimitives` in this module.
3. Normalizing indels
Left-align and normalize indels, check if REF alleles match the reference.
- bcftools norm: https://samtools.github.io/bcftools/bcftools.html
- vt normalize: https://github.com/atks/vt
We are using `vt normalize` in this module.
"""
import cyvcf2
from bcbio import utils
from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do
from bcbio.variation import effects, vcfutils
def normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
rerun_effects=True, remove_oldeffects=False):
"""Normalizes variants and reruns SnpEFF for resulting VCF
"""
if remove_oldeffects:
out_file = "%s-noeff-nomultiallelic%s" % utils.splitext_plus(in_file)
else:
out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
if not utils.file_exists(out_file):
if vcfutils.vcf_has_variants(in_file):
ready_ma_file = _normalize(in_file, data, passonly=passonly,
normalize_indels=normalize_indels,
split_biallelic=split_biallelic,
remove_oldeffects=remove_oldeffects)
if rerun_effects:
ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
if ann_ma_file:
ready_ma_file = ann_ma_file
utils.symlink_plus(ready_ma_file, out_file)
else:
utils.symlink_plus(in_file, out_file)
return vcfutils.bgzip_and_index(out_file, data["config"])
def _normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
remove_oldeffects=False):
"""Convert multi-allelic variants into single allelic.
`vt normalize` has the -n flag passed (skipping reference checks) because
of errors where the reference genome has non GATCN ambiguous bases. These
are not supported in VCF, so you'll have a mismatch of N in VCF versus R
(or other ambiguous bases) in the genome.
"""
if remove_oldeffects:
out_file = "%s-noeff-decompose%s" % utils.splitext_plus(in_file)
old_effects = [a for a in ["CSQ", "ANN"] if a in cyvcf2.VCF(in_file)]
if old_effects:
clean_effects_cmd = " | bcftools annotate -x %s " % (",".join(["INFO/%s" % x for x in old_effects]))
else:
clean_effects_cmd = ""
else:
clean_effects_cmd = ""
out_file = "%s-decompose%s" % utils.splitext_plus(in_file)
if not utils.file_exists(out_file):
ref_file = dd.get_ref_file(data)
assert out_file.endswith(".vcf.gz")
with file_transaction(data, out_file) as tx_out_file:
cmd = ("gunzip -c " + in_file +
(" | bcftools view -f 'PASS,.'" if passonly else "") +
clean_effects_cmd +
(" | vcfallelicprimitives -t DECOMPOSED --keep-geno" if split_biallelic else "") +
" | sed 's/ID=AD,Number=./ID=AD,Number=R/'" +
" | vt decompose -s - " +
((" | vt normalize -n -r " + ref_file + " - ") if normalize_indels else "") +
" | awk '{ gsub(\"./-65\", \"./.\"); print $0 }'" +
" | sed -e 's/Number=A/Number=1/g'" +
" | bgzip -c > " + tx_out_file
)
do.run(cmd, "Multi-allelic to single allele")
return vcfutils.bgzip_and_index(out_file, data["config"])