-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcovid19_call_variants.sh
executable file
·141 lines (118 loc) · 3.25 KB
/
covid19_call_variants.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/bin/bash
# Example command: `bash covid19_call_variants.sh
# reference/nCoV-2019.reference.fasta
# data/twist-target-capture/RNA_control_spike_in_10_6_100k_reads.fastq.gz
# reference/artic-v1/ARTIC-V1.bed`
# shellcheck disable=SC1091
source activate jobscript-env # noqa
set -eou pipefail
# argv
: "${reference:=${1}}"
: "${input_fastq:=${2}}"
: "${primer_bed_file:=${3}}"
: "${consensus_mask_depth:=${4}}"
# defaults
: "${threads:=4}"
: "${prefix:=results}"
: "${min_quality:=20}"
echo "reference=${reference}"
echo "input_fastq=${input_fastq}"
echo "primer_bed_file=${primer_bed_file}"
echo "threads=${threads}"
echo "min_quality=${min_quality}"
# minimap2
# Trim polyA tail for alignment (33 bases)
seqtk trimfq -e 33 "${reference}" > trimmed-reference.fasta
# shellcheck disable=SC2086
echo "[1] mapping reads with minimap2"
minimap2 \
-K 20M \
-a \
-x sr \
-t "${threads}" \
trimmed-reference.fasta \
"${input_fastq}" \
| samtools \
view \
-u \
-h \
-q $min_quality \
-F \
4 - \
| samtools \
sort \
--threads "${threads}" \
- \
> "${prefix}.sorted.bam"
# Trim with ivar
echo "[2] Trimming with ivar"
samtools index "${prefix}.sorted.bam"
# samtools flagstat "${prefix}.sorted.bam"
ivar \
trim \
-e \
-q 0 \
-i "${prefix}.sorted.bam" \
-b "${primer_bed_file}" \
-p "${prefix}.ivar"
echo "[3] Sorting and indexing trimmed BAM"
samtools \
sort \
-@ "${threads}" \
"${prefix}.ivar.bam" \
> "${prefix}.sorted.bam"
samtools \
index \
"${prefix}.sorted.bam"
echo "[4] Generating pileup"
bcftools \
mpileup \
--annotate FORMAT/AD,INFO/AD \
--fasta-ref "${reference}" \
--max-depth 200 \
--no-BAQ \
"${prefix}.sorted.bam" \
| bcftools call \
--variants-only \
--multiallelic-caller \
--output-type z \
--output "${prefix}.raw.vcf.gz"
# save raw vcf
bcftools view \
< "${prefix}.raw.vcf.gz" \
> "${prefix}.raw.vcf"
# filter out low-quality variants
bcftools view \
--exclude "QUAL<150" \
--output-type z \
< "${prefix}.raw.vcf.gz" \
> "${prefix}.vcf.gz"
# bcftools index requires a .vcf.gz file
# in the special indexed gzip format (can't use regular gzip)
bcftools index "${prefix}.vcf.gz"
# We want to generate a consensus sequence with:
# 1. well-supported deletions marked with '-'
# 2. low-coverage regions masked with N
# vcf is 1-based while bedgraph is 0-based; thus the somewhat convoluted path to mask.bed
bcftools query -f'%CHROM\t%POS0\t%END\n' ${prefix}.vcf.gz > variants.bed
# get low coverage sites in bedgraph format
bedtools genomecov -bga -ibam ${prefix}.sorted.bam | awk -v var="$consensus_mask_depth" '$4 < var' > low_coverage_sites.bed
bedtools subtract -a low_coverage_sites.bed -b variants.bed > mask.bed
# generate consensus and rename header
bcftools consensus \
--fasta-ref "${reference}" \
--mark-del '-' \
-m mask.bed \
"${prefix}.vcf.gz" \
| sed \
'/>/ s/$/ | One Codex consensus sequence/' \
> "${prefix}.consensus.fa"
zcat "${prefix}.vcf.gz" > "${prefix}.vcf"
# Move some files around and clean up
mv "${prefix}.consensus.fa" "consensus.fa"
mv "${prefix}.vcf" "variants.vcf"
mv "${prefix}.sorted.bam" "covid19.bam"
mv "${prefix}.sorted.bam.bai" "covid19.bam.bai"
mv "${prefix}.raw.vcf" "variants.raw.vcf"
rm "${prefix}"*
echo "[ ] Finished!"