Skip to content

Commit

Permalink
integrated phasing step into main workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
priesgo committed Mar 8, 2022
1 parent e27eca2 commit 4c35b32
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 9 deletions.
8 changes: 4 additions & 4 deletions bin/phasing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ def __init__(self, input_vcf, output_vcf, input_gtf, input_fasta):
self.vcf_reader = VCF(input_vcf)
self.vcf_writer = Writer(output_vcf, self.vcf_reader)
self.gtf = read_gtf(input_gtf)
self.cds_regions = self.gtf[self.gtf.feature == "CDS"]
self.cds_regions["uid"] = self.cds_regions[["start", "end"]].apply(lambda x: "{}:{}".format(x[0], x[1]), axis=1)
self.cds_regions = self.gtf.loc[self.gtf.feature == "CDS", :]
self.cds_regions.loc[:, "uid"] = self.cds_regions.loc[:, "start":"end"].apply(lambda x: "{}:{}".format(x[0], x[1]), axis=1)
self.fasta = FastaFile(input_fasta)

def _overlap_amino_acid(self, first_variant, first_overlapping_cds, second_variant, second_overlapping_cds) -> bool:
Expand Down Expand Up @@ -88,11 +88,11 @@ def main():
dest="vcf_in",
required=True,
help="Input vcf file listing somatic variants (expects sorted VCF).")
parser.add_argument("-g", "--input-gtf",
parser.add_argument("-g", "--gtf",
dest="gtf",
required=True,
help="Input GTF defining the coding regions.")
parser.add_argument("-f", "--input-fasta",
parser.add_argument("-f", "--fasta",
dest="fasta",
required=True,
help="Input FASTA reference genome.")
Expand Down
4 changes: 2 additions & 2 deletions bin/test_phasing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class TestPhasing(TestCase):

def test_reference_genome_reader(self):
def test_phasing(self):
os.makedirs("./output/phasing", exist_ok=True)
output_vcf = "./output/phasing/test_data.merged.vcf.gz"
if os.path.exists(output_vcf):
Expand All @@ -18,7 +18,7 @@ def test_reference_genome_reader(self):
input_fasta="./reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa"
).run()
self.assertTrue(os.path.exists(output_vcf))
self._assert_vcf(output_vcf, 56)
self._assert_vcf(output_vcf, 59)

def _assert_vcf(self, vcf, expected_count_variants):
vcf_reader = VCF(vcf)
Expand Down
6 changes: 4 additions & 2 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ include { ALIGNMENT_PAIRED_END; ALIGNMENT_SINGLE_END } from './modules/02_bwa'
include { BAM_PREPROCESSING; COVERAGE_ANALYSIS } from './modules/03_bam_preprocessing'
include { VARIANT_CALLING_BCFTOOLS; VARIANT_CALLING_LOFREQ ; VARIANT_CALLING_GATK ;
VARIANT_CALLING_IVAR ; VARIANT_CALLING_ASSEMBLY; IVAR2VCF } from './modules/04_variant_calling'
include { VARIANT_NORMALIZATION } from './modules/05_variant_normalization'
include { VARIANT_NORMALIZATION ; PHASING } from './modules/05_variant_normalization'
include { VARIANT_ANNOTATION; VARIANT_SARSCOV2_ANNOTATION;
VARIANT_VAF_ANNOTATION; VAFATOR } from './modules/06_variant_annotation'
include { PANGOLIN_LINEAGE; VCF2FASTA } from './modules/07_lineage_annotation'
Expand Down Expand Up @@ -249,5 +249,7 @@ workflow {
normalized_vcfs = VARIANT_VAF_ANNOTATION.out.vaf_annotated
}

BGZIP(normalized_vcfs)
PHASING(normalized_vcfs, reference, gff)

BGZIP(PHASING.out)
}
28 changes: 28 additions & 0 deletions modules/05_variant_normalization.nf
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,32 @@ process VARIANT_NORMALIZATION {
# remove duplicates after normalisation
bcftools norm --rm-dup exact -o ${name}.${caller}.normalized.vcf -
"""
}

process PHASING {
cpus params.cpus
memory params.memory
if (params.keep_intermediate) {
publishDir "${params.output}", mode: "copy"
}
tag "${name}"

conda (params.enable_conda ? "conda-forge::python=3.8.5 conda-forge::pandas=1.1.5 bioconda::pysam=0.17.0 bioconda::gtfparse=1.2.1 bioconda::cyvcf2=0.30.14" : null)

input:
tuple val(name), val(caller), file(vcf)
val(fasta)
val(gtf)

output:
tuple val(name), val(caller), file("${name}.${caller}.phased.vcf")

script:
"""
phasing.py \
--fasta ${fasta} \
--gtf ${gtf} \
--input-vcf ${vcf} \
--output-vcf ${name}.${caller}.phased.vcf
"""
}
3 changes: 3 additions & 0 deletions test_data/test_data.lofreq.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ MN908947.3 12325 . C T 146 LOW_FREQUENCY DP=45;AF=0.133333;SB=4;DP4=15,24,4,2;AN
MN908947.3 12329 . A AC 54 LOW_FREQUENCY DP=54;AF=0.037037;SB=0;DP4=26,26,1,1;INDEL;HRUN=3;ANN=AC|frameshift_variant|HIGH|ORF1ab|gene-GU280_gp01|transcript|TRANSCRIPT_gene-GU280_gp01|protein_coding|1/1|c.12064_12065insC|p.R4022fs|12065/21290|12065/21290|4022/7095||WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS;LOF=(ORF1ab|gene-GU280_gp01|1|1.00);CONS_HMM_SARS_COV_2=0;CONS_HMM_SARBECOVIRUS=0;CONS_HMM_VERTEBRATE_COV=0;PFAM_NAME=NSP8_CoV-like;PFAM_DESCRIPTION=Non-structural protein NSP8, coronavirus-like;vafator_af=0;vafator_ac=0;vafator_dp=54
MN908947.3 12334 . A C 184 LOW_FREQUENCY DP=63;AF=0.126984;SB=0;DP4=31,24,5,3;ANN=C|synonymous_variant|LOW|ORF1ab|gene-GU280_gp01|transcript|TRANSCRIPT_gene-GU280_gp01|protein_coding|1/1|c.12069A>C|p.A4023A|12069/21290|12069/21290|4023/7095||WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS;CONS_HMM_SARS_COV_2=-0.35633;CONS_HMM_SARBECOVIRUS=-0.35633;CONS_HMM_VERTEBRATE_COV=0;PFAM_NAME=NSP8_CoV-like;PFAM_DESCRIPTION=Non-structural protein NSP8, coronavirus-like;vafator_af=0.126984;vafator_ac=8;vafator_dp=63
MN908947.3 12335 . A G 54 LOW_FREQUENCY DP=62;AF=0.048387;SB=0;DP4=34,25,2,1;ANN=G|missense_variant|MODERATE|ORF1ab|gene-GU280_gp01|transcript|TRANSCRIPT_gene-GU280_gp01|protein_coding|1/1|c.12070A>G|p.K4024E|12070/21290|12070/21290|4024/7095||WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS;CONS_HMM_SARS_COV_2=2.17352;CONS_HMM_SARBECOVIRUS=0.57215;CONS_HMM_VERTEBRATE_COV=2.17352;PFAM_NAME=NSP8_CoV-like;PFAM_DESCRIPTION=Non-structural protein NSP8, coronavirus-like;vafator_af=0.0483871;vafator_ac=3;vafator_dp=62
MN908947.3 12336 . A G 54 LOW_FREQUENCY DP=62;AF=0.048387;SB=0;DP4=34,25,2,1;ANN=G|missense_variant|MODERATE|ORF1ab|gene-GU280_gp01|transcript|TRANSCRIPT_gene-GU280_gp01|protein_coding|1/1|c.12070A>G|p.K4024E|12070/21290|12070/21290|4024/7095||WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS;CONS_HMM_SARS_COV_2=2.17352;CONS_HMM_SARBECOVIRUS=0.57215;CONS_HMM_VERTEBRATE_COV=2.17352;PFAM_NAME=NSP8_CoV-like;PFAM_DESCRIPTION=Non-structural protein NSP8, coronavirus-like;vafator_af=0.0483871;vafator_ac=3;vafator_dp=62
MN908947.3 12337 . A G 54 LOW_FREQUENCY DP=62;AF=0.048387;SB=0;DP4=34,25,2,1;ANN=G|missense_variant|MODERATE|ORF1ab|gene-GU280_gp01|transcript|TRANSCRIPT_gene-GU280_gp01|protein_coding|1/1|c.12070A>G|p.K4024E|12070/21290|12070/21290|4024/7095||WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS;CONS_HMM_SARS_COV_2=2.17352;CONS_HMM_SARBECOVIRUS=0.57215;CONS_HMM_VERTEBRATE_COV=2.17352;PFAM_NAME=NSP8_CoV-like;PFAM_DESCRIPTION=Non-structural protein NSP8, coronavirus-like;vafator_af=0.0483871;vafator_ac=3;vafator_dp=62
MN908947.3 12338 . A G 54 LOW_FREQUENCY DP=62;AF=0.048387;SB=0;DP4=34,25,2,1;ANN=G|missense_variant|MODERATE|ORF1ab|gene-GU280_gp01|transcript|TRANSCRIPT_gene-GU280_gp01|protein_coding|1/1|c.12070A>G|p.K4024E|12070/21290|12070/21290|4024/7095||WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS;CONS_HMM_SARS_COV_2=2.17352;CONS_HMM_SARBECOVIRUS=0.57215;CONS_HMM_VERTEBRATE_COV=2.17352;PFAM_NAME=NSP8_CoV-like;PFAM_DESCRIPTION=Non-structural protein NSP8, coronavirus-like;vafator_af=0.0483871;vafator_ac=3;vafator_dp=62
MN908947.3 12439 . CT C 50 LOW_FREQUENCY DP=98;AF=0.020408;SB=0;DP4=56,40,1,1;INDEL;HRUN=2;ANN=C|frameshift_variant|HIGH|ORF1ab|gene-GU280_gp01|transcript|TRANSCRIPT_gene-GU280_gp01|protein_coding|1/1|c.12175delT|p.L4059fs|12175/21290|12175/21290|4059/7095||WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS;LOF=(ORF1ab|gene-GU280_gp01|1|1.00);CONS_HMM_SARS_COV_2=-1.04511;CONS_HMM_SARBECOVIRUS=-1.04511;CONS_HMM_VERTEBRATE_COV=0;PFAM_NAME=NSP8_CoV-like;PFAM_DESCRIPTION=Non-structural protein NSP8, coronavirus-like;vafator_af=0.0204082;vafator_ac=2;vafator_dp=98
MN908947.3 12862 . TG T 55 LOW_FREQUENCY DP=52;AF=0.038462;SB=0;DP4=27,24,1,1;INDEL;HRUN=1;ANN=T|frameshift_variant|HIGH|ORF1ab|gene-GU280_gp01|transcript|TRANSCRIPT_gene-GU280_gp01|protein_coding|1/1|c.12598delG|p.D4200fs|12598/21290|12598/21290|4200/7095||WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS;LOF=(ORF1ab|gene-GU280_gp01|1|1.00);CONS_HMM_SARS_COV_2=1.1685;CONS_HMM_SARBECOVIRUS=0.57215;CONS_HMM_VERTEBRATE_COV=1.1685;PFAM_NAME=NSP9_CoV;PFAM_DESCRIPTION=Non-structural protein NSP9, coronavirus;vafator_af=0.0377358;vafator_ac=2;vafator_dp=53
MN908947.3 12895 . AC A 83 LOW_FREQUENCY DP=68;AF=0.044118;SB=0;DP4=32,33,2,1;INDEL;HRUN=2;ANN=A|frameshift_variant|HIGH|ORF1ab|gene-GU280_gp01|transcript|TRANSCRIPT_gene-GU280_gp01|protein_coding|1/1|c.12631delC|p.P4211fs|12631/21290|12631/21290|4211/7095||WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS;LOF=(ORF1ab|gene-GU280_gp01|1|1.00);CONS_HMM_SARS_COV_2=1.1685;CONS_HMM_SARBECOVIRUS=0.67226;CONS_HMM_VERTEBRATE_COV=1.1685;PFAM_NAME=NSP9_CoV;PFAM_DESCRIPTION=Non-structural protein NSP9, coronavirus;vafator_af=0.0441176;vafator_ac=3;vafator_dp=68
Expand Down
2 changes: 1 addition & 1 deletion tests/test_01.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ output=output/test1
nextflow main.nf -profile test,conda --name test_data \
--output $output \
--fastq1 test_data/test_data_1.fastq.gz \
--fastq2 test_data/test_data_2.fastq.gz
--fastq2 test_data/test_data_2.fastq.gz --keep_intermediate

test -s $output/test_data.bcftools.vcf.gz || { echo "Missing VCF output file!"; exit 1; }
test -s $output/test_data.gatk.vcf.gz || { echo "Missing VCF output file!"; exit 1; }
Expand Down

0 comments on commit 4c35b32

Please sign in to comment.