Merge pull request #8 from TRON-Bioinformatics/fix-vafator-params

upgrade to the latest VAFator v1.2.5 + normalization is now optional
TRON-Bioinformatics · Feb 21, 2022 · 2881fc8 · 2881fc8
2 parents 87e1025 + 21c9e63
commit 2881fc8
Show file tree

Hide file tree

Showing 11 changed files with 96 additions and 45 deletions.
diff --git a/Makefile b/Makefile
@@ -22,3 +22,5 @@ test:
 	bash tests/run_test_5.sh
 	bash tests/run_test_6.sh
 	bash tests/run_test_7.sh
+	bash tests/run_test_8.sh
+	bash tests/run_test_10.sh
diff --git a/README.md b/README.md
@@ -18,6 +18,8 @@ This pipeline has several objectives:
 * Technical annotations from different BAM files
 * Functional annotations
 
+All of the previous steps are optional.
+
 ## How to run it
 
 Run it from GitHub as follows:
@@ -47,11 +49,12 @@ Input:
     Example input file:
     sample1	/path/to/your/file.vcf
     sample2	/path/to/your/file2.vcf
-    * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
-    * --vcf-without-ad: indicate when the VCFs to normalize do not have the FORMAT/AD annotation
 
 Optional input:
+    * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict) [required for normalization]
+    * --vcf-without-ad: indicate when the VCFs to normalize do not have the FORMAT/AD annotation
     * --output: the folder where to publish output
+    * --skip_normalization: flag indicating to skip all normalization steps
     * --skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels)
     * --filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept
     * --input_bams: a tab-separated values file containing in each row the sample name, tumor and normal BAM files for annotation with Vafator
@@ -96,11 +99,13 @@ The aggregated vafator annotations on each sample will also be provided without
 
 ## Variant filtering
 
-Optionally, only variants with the value in the column `FILTER` matching the value of parameter `--filter` are kept.
+Only variants with the value in the column `FILTER` matching the value of parameter `--filter` are kept.
 If this parameter is not used not variants are filtered out. Multiple values can be passed separated by commas without spaces.
 
 For instance, `--filter PASS,.` will keep variant having `FILTER=PASS` or `FILTER=.`, but remove all others.
 
+No filter is applied if `--filter` is not passed.
+
 
 ## Variant normalization
 
@@ -112,13 +117,15 @@ The normalization pipeline consists of the following steps:
  * Decomposition of multiallelic variants into biallelic variants (ie: A > C,G is decomposed into two variants A > C and A > G)
  * Trim redundant sequence and left align indels, indels in repetitive sequences can have multiple representations
  * Remove duplicated variants
-
-Optionally if BAM files are provided (through `--bam_files`) VCFs are annotated with allele frequencies and depth of 
-coverage by Vafator (https://github.com/TRON-Bioinformatics/vafator). 
 
 The output consists of:
  * The normalized VCF
  * Summary statistics before and after normalization
+
+The parameter `--reference` pointing to the reference genome FASTA file is required for normalization. 
+The reference genome requires *.fai and *.dict indexes.
+
+Normalization is not applied if the parameter `--skip_normalization` is passed.
 
 
 ![Pipeline](images/variant_normalization_pipeline.png)
@@ -215,6 +222,8 @@ within the pileup of a BAM file. When doing somatic variant calling it may be re
 for the same variant in a patient from multiple BAM files.
 These annotations are provided by VAFator (https://github.com/TRON-Bioinformatics/vafator).
 
+No technical annotations are performed if the parameter `--input_bams` is not passed.
+
 ## Functional annotations
 
 The functional annotations provide a biological context for every variant. Such as the overlapping genes or the effect 
@@ -238,6 +247,8 @@ To provide any additional SnpEff arguments use `--snpeff_args` such as
 `--snpeff_args "-noStats -no-downstream -no-upstream -no-intergenic -no-intron -onlyProtein -hgvs1LetterAa -noShiftHgvs"`, 
 otherwise defaults will be used.
 
+No functional annotations are performed if the parameters `--snpeff_organism` and `--snpeff_datadir` are not passed.
+
 
 ## References
 

diff --git a/main.nf b/main.nf
@@ -14,6 +14,7 @@ params.input_bams = false
 params.input_vcf = false
 params.reference = false
 params.output = "output"
+params.skip_normalization = false
 params.skip_decompose_complex = false
 params.filter = false
 params.cpus = 1
@@ -32,6 +33,10 @@ if ( params.snpeff_organism && ! params.snpeff_datadir) {
   exit 1, "To run snpEff, please, provide your snpEff data folder with --snpeff_datadir"
 }
 
+if (params.skip_normalization && ! params.input_bams && ! params.snpeff_organism) {
+  exit -1, "Neither normalization, VAFator annotation or SnpEff annotation enabled! Nothing to do..."
+}
+
 if (! params.input_vcfs && ! params.input_vcf) {
   exit 1, "Neither --input_vcfs or --input_vcf are provided!"
 }
@@ -67,15 +72,20 @@ workflow {
 
     SUMMARY_VCF(input_vcfs)
 
-    final_vcfs = BCFTOOLS_NORM(input_vcfs)
-    if (! params.skip_decompose_complex) {
-        VT_DECOMPOSE_COMPLEX(final_vcfs)
-        final_vcfs = VT_DECOMPOSE_COMPLEX.out.decomposed_vcfs
-    }
-    REMOVE_DUPLICATES(final_vcfs)
-    final_vcfs = REMOVE_DUPLICATES.out.deduplicated_vcfs
+    if (! params.skip_normalization) {
+        final_vcfs = BCFTOOLS_NORM(input_vcfs)
+        if (! params.skip_decompose_complex) {
+            VT_DECOMPOSE_COMPLEX(final_vcfs)
+            final_vcfs = VT_DECOMPOSE_COMPLEX.out.decomposed_vcfs
+        }
+        REMOVE_DUPLICATES(final_vcfs)
+        final_vcfs = REMOVE_DUPLICATES.out.deduplicated_vcfs
 
-    SUMMARY_VCF_2(final_vcfs)
+        SUMMARY_VCF_2(final_vcfs)
+    }
+    else {
+        final_vcfs = input_vcfs
+    }
 
     if ( params.input_bams ) {
         VAFATOR(final_vcfs.join(input_bams.groupTuple()))

diff --git a/modules/04_vafator.nf b/modules/04_vafator.nf
@@ -12,20 +12,20 @@ process VAFATOR {
     tag "${patient_name}"
     publishDir "${params.output}/${patient_name}", mode: "copy"
 
-    conda (params.enable_conda ? "bioconda::vafator=1.1.4" : null)
+    conda (params.enable_conda ? "bioconda::vafator=1.2.5" : null)
 
     input:
     tuple val(patient_name), file(vcf), val(bams)
 
     output:
-    tuple val(patient_name), file("${vcf.baseName}.vaf.vcf"), emit: annotated_vcf
+    tuple val(patient_name), file("${patient_name}.vaf.vcf"), emit: annotated_vcf
 
     script:
     bams_param = bams.collect { b -> "--bam " + b.split(":").join(" ") }.join(" ")
     """
     vafator \
     --input-vcf ${vcf} \
-    --output-vcf ${vcf.baseName}.vaf.vcf \
+    --output-vcf ${patient_name}.vaf.vcf \
     --mapping-quality ${params.mapping_quality} \
     --base-call-quality ${params.base_call_quality} \
     ${bams_param}
@@ -39,16 +39,16 @@ process MULTIALLELIC_FILTER {
     tag "${name}"
     publishDir "${params.output}/${name}", mode: "copy"
 
-    conda (params.enable_conda ? "bioconda::vafator=1.1.4" : null)
+    conda (params.enable_conda ? "bioconda::vafator=1.2.5" : null)
 
     input:
     tuple val(name), file(vcf)
 
     output:
-    tuple val(name), file("${vcf.baseName}.filtered_multiallelics.vcf"), emit: filtered_vcf
+    tuple val(name), file("${name}.filtered_multiallelics.vcf"), emit: filtered_vcf
 
     script:
     """
-    multiallelics-filter --input-vcf ${vcf} --output-vcf ${vcf.baseName}.filtered_multiallelics.vcf
+    multiallelics-filter --input-vcf ${vcf} --output-vcf ${name}.filtered_multiallelics.vcf
     """
 }
diff --git a/nextflow.config b/nextflow.config
@@ -53,10 +53,11 @@ Input:
     Example input file:
     sample1	/path/to/your/file.vcf
     sample2	/path/to/your/file2.vcf
-    * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
 
 Optional input:
+    * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict) [required for normalization]
     * --output: the folder where to publish output
+    * --skip_normalization: flag indicating to skip all normalization steps
     * --skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels)
     * --filter: specify a comma-separated list of filters to apply (e.g.: PASS,.), only variants with these values will be kept. If not provided all varianst are kept
     * --vcf-without-ad: indicate when the VCFs to normalize do not have the FORMAT/AD annotation

diff --git a/test_data/test_bams.txt b/test_data/test_bams.txt
@@ -1,2 +1,6 @@
-tumor_normal	/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam	/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam
-single_sample	/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam,/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam	/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam,/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam
+tumor_normal	primary:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam
+tumor_normal	normal:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam
+single_sample	tumor:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam
+single_sample	tumor:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam
+single_sample	normal:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam
+single_sample	normal:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam
diff --git a/tests/run_test_0.sh b/tests/run_test_0.sh
@@ -1,4 +1,11 @@
 #!/bin/bash
 
+output_folder=output/test0
 
-nextflow main.nf --help
+nextflow main.nf --help
+
+nextflow main.nf -profile test,conda --output $output_folder --skip_normalization
+
+# missing SNpEff data folder
+nextflow main.nf -profile test,conda --output $output_folder --snpeff_organism hg19
+test ! -d $output_folder
diff --git a/tests/run_test_10.sh b/tests/run_test_10.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+
+source tests/assert.sh
+output_folder=output/test10
+echo -e "tumor_normal\tprimary:"`pwd`"/test_data/TESTX_S1_L001.bam" > test_data/test_bams.txt
+echo -e "tumor_normal\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt
+echo -e "single_sample\ttumor:"`pwd`"/test_data/TESTX_S1_L001.bam" >> test_data/test_bams.txt
+echo -e "single_sample\ttumor:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt
+echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L001.bam" >> test_data/test_bams.txt
+echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt
+nextflow main.nf -profile test,conda --output $output_folder --input_bams test_data/test_bams.txt --skip_normalization
+test -s $output_folder/single_sample/single_sample.filtered_multiallelics.vcf || { echo "Missing test 10 output file!"; exit 1; }
+test -s $output_folder/tumor_normal/tumor_normal.filtered_multiallelics.vcf || { echo "Missing test 10 output file!"; exit 1; }
+test -s $output_folder/single_sample/single_sample.vaf.vcf || { echo "Missing test 10 output file!"; exit 1; }
+test -s $output_folder/tumor_normal/tumor_normal.vaf.vcf || { echo "Missing test 10 output file!"; exit 1; }
diff --git a/tests/run_test_6.sh b/tests/run_test_6.sh
@@ -12,15 +12,15 @@ echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data
 nextflow main.nf -profile test,conda --output $output_folder --input_bams test_data/test_bams.txt
 test -s $output_folder/single_sample/single_sample.normalized.vcf || { echo "Missing test 6 output file!"; exit 1; }
 test -s $output_folder/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 6 output file!"; exit 1; }
-test -s $output_folder/single_sample/single_sample.normalized.vaf.vcf || { echo "Missing test 6 output file!"; exit 1; }
-test -s $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf || { echo "Missing test 6 output file!"; exit 1; }
-test -s $output_folder/single_sample/single_sample.normalized.vaf.filtered_multiallelics.vcf || { echo "Missing test 6 output file!"; exit 1; }
-test -s $output_folder/tumor_normal/tumor_normal.normalized.vaf.filtered_multiallelics.vcf || { echo "Missing test 6 output file!"; exit 1; }
+test -s $output_folder/single_sample/single_sample.vaf.vcf || { echo "Missing test 6 output file!"; exit 1; }
+test -s $output_folder/tumor_normal/tumor_normal.vaf.vcf || { echo "Missing test 6 output file!"; exit 1; }
+test -s $output_folder/single_sample/single_sample.filtered_multiallelics.vcf || { echo "Missing test 6 output file!"; exit 1; }
+test -s $output_folder/tumor_normal/tumor_normal.filtered_multiallelics.vcf || { echo "Missing test 6 output file!"; exit 1; }
 assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants"
 assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants"
 assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vaf.vcf | cut -d' ' -f 1` 72 "Wrong number of variants"
-assert_eq `grep tumor_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
-assert_eq `grep normal_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
-assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants"
-assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vaf.filtered_multiallelics.vcf | cut -d' ' -f 1` 53 "Wrong number of variants"
-assert_eq `grep primary_af $output_folder/tumor_normal/tumor_normal.normalized.vaf.filtered_multiallelics.vcf | wc -l | cut -d' ' -f 1` 24 "Wrong number of variants"
+assert_eq `grep tumor_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
+assert_eq `grep normal_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
+assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants"
+assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.filtered_multiallelics.vcf | cut -d' ' -f 1` 53 "Wrong number of variants"
+assert_eq `grep primary_af $output_folder/tumor_normal/tumor_normal.filtered_multiallelics.vcf | wc -l | cut -d' ' -f 1` 24 "Wrong number of variants"
diff --git a/tests/run_test_7.sh b/tests/run_test_7.sh
@@ -12,11 +12,11 @@ echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data
 nextflow main.nf -profile test,conda --output $output_folder --input_bams test_data/test_bams.txt --skip_multiallelic_filter
 test -s $output_folder/single_sample/single_sample.normalized.vcf || { echo "Missing test 7 output file!"; exit 1; }
 test -s $output_folder/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 7 output file!"; exit 1; }
-test -s $output_folder/single_sample/single_sample.normalized.vaf.vcf || { echo "Missing test 7 output file!"; exit 1; }
-test -s $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf || { echo "Missing test 7 output file!"; exit 1; }
+test -s $output_folder/single_sample/single_sample.vaf.vcf || { echo "Missing test 7 output file!"; exit 1; }
+test -s $output_folder/tumor_normal/tumor_normal.vaf.vcf || { echo "Missing test 7 output file!"; exit 1; }
 assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants"
 assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants"
-assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vaf.vcf | cut -d' ' -f 1` 72 "Wrong number of variants"
-assert_eq `grep tumor_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
-assert_eq `grep normal_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
-assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants"
+assert_eq `wc -l $output_folder/single_sample/single_sample.vaf.vcf | cut -d' ' -f 1` 72 "Wrong number of variants"
+assert_eq `grep tumor_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
+assert_eq `grep normal_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
+assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants"
diff --git a/tests/run_test_8.sh b/tests/run_test_8.sh
@@ -12,11 +12,11 @@ echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data
 nextflow main.nf -profile test,conda --output $output_folder --input_bams test_data/test_bams.txt
 test -s $output_folder/single_sample/single_sample.normalized.vcf || { echo "Missing test 8 output file!"; exit 1; }
 test -s $output_folder/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 8 output file!"; exit 1; }
-test -s $output_folder/single_sample/single_sample.normalized.vaf.vcf || { echo "Missing test 8 output file!"; exit 1; }
-test -s $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf || { echo "Missing test 8 output file!"; exit 1; }
+test -s $output_folder/single_sample/single_sample.vaf.vcf || { echo "Missing test 8 output file!"; exit 1; }
+test -s $output_folder/tumor_normal/tumor_normal.vaf.vcf || { echo "Missing test 8 output file!"; exit 1; }
 assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants"
 assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants"
-assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vaf.vcf | cut -d' ' -f 1` 72 "Wrong number of variants"
-assert_eq `grep tumor_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
-assert_eq `grep normal_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
-assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants"
+assert_eq `wc -l $output_folder/single_sample/single_sample.vaf.vcf | cut -d' ' -f 1` 72 "Wrong number of variants"
+assert_eq `grep tumor_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
+assert_eq `grep normal_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants"
+assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants"