diff --git a/Makefile b/Makefile index 1b65dce..58bbba7 100644 --- a/Makefile +++ b/Makefile @@ -22,3 +22,5 @@ test: bash tests/run_test_5.sh bash tests/run_test_6.sh bash tests/run_test_7.sh + bash tests/run_test_8.sh + bash tests/run_test_10.sh diff --git a/README.md b/README.md index b573c03..533590f 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ This pipeline has several objectives: * Technical annotations from different BAM files * Functional annotations +All of the previous steps are optional. + ## How to run it Run it from GitHub as follows: @@ -47,11 +49,12 @@ Input: Example input file: sample1 /path/to/your/file.vcf sample2 /path/to/your/file2.vcf - * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict) - * --vcf-without-ad: indicate when the VCFs to normalize do not have the FORMAT/AD annotation Optional input: + * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict) [required for normalization] + * --vcf-without-ad: indicate when the VCFs to normalize do not have the FORMAT/AD annotation * --output: the folder where to publish output + * --skip_normalization: flag indicating to skip all normalization steps * --skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels) * --filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept * --input_bams: a tab-separated values file containing in each row the sample name, tumor and normal BAM files for annotation with Vafator @@ -96,11 +99,13 @@ The aggregated vafator annotations on each sample will also be provided without ## Variant filtering -Optionally, only variants with the value in the column `FILTER` matching the value of parameter `--filter` are kept. +Only variants with the value in the column `FILTER` matching the value of parameter `--filter` are kept. If this parameter is not used not variants are filtered out. Multiple values can be passed separated by commas without spaces. For instance, `--filter PASS,.` will keep variant having `FILTER=PASS` or `FILTER=.`, but remove all others. +No filter is applied if `--filter` is not passed. + ## Variant normalization @@ -112,13 +117,15 @@ The normalization pipeline consists of the following steps: * Decomposition of multiallelic variants into biallelic variants (ie: A > C,G is decomposed into two variants A > C and A > G) * Trim redundant sequence and left align indels, indels in repetitive sequences can have multiple representations * Remove duplicated variants - -Optionally if BAM files are provided (through `--bam_files`) VCFs are annotated with allele frequencies and depth of -coverage by Vafator (https://github.com/TRON-Bioinformatics/vafator). The output consists of: * The normalized VCF * Summary statistics before and after normalization + +The parameter `--reference` pointing to the reference genome FASTA file is required for normalization. +The reference genome requires *.fai and *.dict indexes. + +Normalization is not applied if the parameter `--skip_normalization` is passed. ![Pipeline](images/variant_normalization_pipeline.png) @@ -215,6 +222,8 @@ within the pileup of a BAM file. When doing somatic variant calling it may be re for the same variant in a patient from multiple BAM files. These annotations are provided by VAFator (https://github.com/TRON-Bioinformatics/vafator). +No technical annotations are performed if the parameter `--input_bams` is not passed. + ## Functional annotations The functional annotations provide a biological context for every variant. Such as the overlapping genes or the effect @@ -238,6 +247,8 @@ To provide any additional SnpEff arguments use `--snpeff_args` such as `--snpeff_args "-noStats -no-downstream -no-upstream -no-intergenic -no-intron -onlyProtein -hgvs1LetterAa -noShiftHgvs"`, otherwise defaults will be used. +No functional annotations are performed if the parameters `--snpeff_organism` and `--snpeff_datadir` are not passed. + ## References diff --git a/main.nf b/main.nf index a09e9e2..5480b3c 100755 --- a/main.nf +++ b/main.nf @@ -14,6 +14,7 @@ params.input_bams = false params.input_vcf = false params.reference = false params.output = "output" +params.skip_normalization = false params.skip_decompose_complex = false params.filter = false params.cpus = 1 @@ -32,6 +33,10 @@ if ( params.snpeff_organism && ! params.snpeff_datadir) { exit 1, "To run snpEff, please, provide your snpEff data folder with --snpeff_datadir" } +if (params.skip_normalization && ! params.input_bams && ! params.snpeff_organism) { + exit -1, "Neither normalization, VAFator annotation or SnpEff annotation enabled! Nothing to do..." +} + if (! params.input_vcfs && ! params.input_vcf) { exit 1, "Neither --input_vcfs or --input_vcf are provided!" } @@ -67,15 +72,20 @@ workflow { SUMMARY_VCF(input_vcfs) - final_vcfs = BCFTOOLS_NORM(input_vcfs) - if (! params.skip_decompose_complex) { - VT_DECOMPOSE_COMPLEX(final_vcfs) - final_vcfs = VT_DECOMPOSE_COMPLEX.out.decomposed_vcfs - } - REMOVE_DUPLICATES(final_vcfs) - final_vcfs = REMOVE_DUPLICATES.out.deduplicated_vcfs + if (! params.skip_normalization) { + final_vcfs = BCFTOOLS_NORM(input_vcfs) + if (! params.skip_decompose_complex) { + VT_DECOMPOSE_COMPLEX(final_vcfs) + final_vcfs = VT_DECOMPOSE_COMPLEX.out.decomposed_vcfs + } + REMOVE_DUPLICATES(final_vcfs) + final_vcfs = REMOVE_DUPLICATES.out.deduplicated_vcfs - SUMMARY_VCF_2(final_vcfs) + SUMMARY_VCF_2(final_vcfs) + } + else { + final_vcfs = input_vcfs + } if ( params.input_bams ) { VAFATOR(final_vcfs.join(input_bams.groupTuple())) diff --git a/modules/04_vafator.nf b/modules/04_vafator.nf index 3f3493e..f04983b 100644 --- a/modules/04_vafator.nf +++ b/modules/04_vafator.nf @@ -12,20 +12,20 @@ process VAFATOR { tag "${patient_name}" publishDir "${params.output}/${patient_name}", mode: "copy" - conda (params.enable_conda ? "bioconda::vafator=1.1.4" : null) + conda (params.enable_conda ? "bioconda::vafator=1.2.5" : null) input: tuple val(patient_name), file(vcf), val(bams) output: - tuple val(patient_name), file("${vcf.baseName}.vaf.vcf"), emit: annotated_vcf + tuple val(patient_name), file("${patient_name}.vaf.vcf"), emit: annotated_vcf script: bams_param = bams.collect { b -> "--bam " + b.split(":").join(" ") }.join(" ") """ vafator \ --input-vcf ${vcf} \ - --output-vcf ${vcf.baseName}.vaf.vcf \ + --output-vcf ${patient_name}.vaf.vcf \ --mapping-quality ${params.mapping_quality} \ --base-call-quality ${params.base_call_quality} \ ${bams_param} @@ -39,16 +39,16 @@ process MULTIALLELIC_FILTER { tag "${name}" publishDir "${params.output}/${name}", mode: "copy" - conda (params.enable_conda ? "bioconda::vafator=1.1.4" : null) + conda (params.enable_conda ? "bioconda::vafator=1.2.5" : null) input: tuple val(name), file(vcf) output: - tuple val(name), file("${vcf.baseName}.filtered_multiallelics.vcf"), emit: filtered_vcf + tuple val(name), file("${name}.filtered_multiallelics.vcf"), emit: filtered_vcf script: """ - multiallelics-filter --input-vcf ${vcf} --output-vcf ${vcf.baseName}.filtered_multiallelics.vcf + multiallelics-filter --input-vcf ${vcf} --output-vcf ${name}.filtered_multiallelics.vcf """ } diff --git a/nextflow.config b/nextflow.config index 957a3cd..4174e51 100644 --- a/nextflow.config +++ b/nextflow.config @@ -53,10 +53,11 @@ Input: Example input file: sample1 /path/to/your/file.vcf sample2 /path/to/your/file2.vcf - * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict) Optional input: + * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict) [required for normalization] * --output: the folder where to publish output + * --skip_normalization: flag indicating to skip all normalization steps * --skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels) * --filter: specify a comma-separated list of filters to apply (e.g.: PASS,.), only variants with these values will be kept. If not provided all varianst are kept * --vcf-without-ad: indicate when the VCFs to normalize do not have the FORMAT/AD annotation diff --git a/test_data/test_bams.txt b/test_data/test_bams.txt index 835a3eb..c42dc36 100644 --- a/test_data/test_bams.txt +++ b/test_data/test_bams.txt @@ -1,2 +1,6 @@ -tumor_normal /home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam /home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam -single_sample /home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam,/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam /home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam,/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam +tumor_normal primary:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam +tumor_normal normal:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam +single_sample tumor:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam +single_sample tumor:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam +single_sample normal:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L001.bam +single_sample normal:/home/priesgo/src/github/tronflow-variant-normalization/test_data/TESTX_S1_L002.bam diff --git a/tests/run_test_0.sh b/tests/run_test_0.sh index 5629ccd..5b3a47c 100644 --- a/tests/run_test_0.sh +++ b/tests/run_test_0.sh @@ -1,4 +1,11 @@ #!/bin/bash +output_folder=output/test0 -nextflow main.nf --help \ No newline at end of file +nextflow main.nf --help + +nextflow main.nf -profile test,conda --output $output_folder --skip_normalization + +# missing SNpEff data folder +nextflow main.nf -profile test,conda --output $output_folder --snpeff_organism hg19 +test ! -d $output_folder \ No newline at end of file diff --git a/tests/run_test_10.sh b/tests/run_test_10.sh new file mode 100644 index 0000000..62d2cff --- /dev/null +++ b/tests/run_test_10.sh @@ -0,0 +1,16 @@ +#!/bin/bash + + +source tests/assert.sh +output_folder=output/test10 +echo -e "tumor_normal\tprimary:"`pwd`"/test_data/TESTX_S1_L001.bam" > test_data/test_bams.txt +echo -e "tumor_normal\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt +echo -e "single_sample\ttumor:"`pwd`"/test_data/TESTX_S1_L001.bam" >> test_data/test_bams.txt +echo -e "single_sample\ttumor:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt +echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L001.bam" >> test_data/test_bams.txt +echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data/test_bams.txt +nextflow main.nf -profile test,conda --output $output_folder --input_bams test_data/test_bams.txt --skip_normalization +test -s $output_folder/single_sample/single_sample.filtered_multiallelics.vcf || { echo "Missing test 10 output file!"; exit 1; } +test -s $output_folder/tumor_normal/tumor_normal.filtered_multiallelics.vcf || { echo "Missing test 10 output file!"; exit 1; } +test -s $output_folder/single_sample/single_sample.vaf.vcf || { echo "Missing test 10 output file!"; exit 1; } +test -s $output_folder/tumor_normal/tumor_normal.vaf.vcf || { echo "Missing test 10 output file!"; exit 1; } \ No newline at end of file diff --git a/tests/run_test_6.sh b/tests/run_test_6.sh index 7181bff..e3c64b0 100644 --- a/tests/run_test_6.sh +++ b/tests/run_test_6.sh @@ -12,15 +12,15 @@ echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data nextflow main.nf -profile test,conda --output $output_folder --input_bams test_data/test_bams.txt test -s $output_folder/single_sample/single_sample.normalized.vcf || { echo "Missing test 6 output file!"; exit 1; } test -s $output_folder/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 6 output file!"; exit 1; } -test -s $output_folder/single_sample/single_sample.normalized.vaf.vcf || { echo "Missing test 6 output file!"; exit 1; } -test -s $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf || { echo "Missing test 6 output file!"; exit 1; } -test -s $output_folder/single_sample/single_sample.normalized.vaf.filtered_multiallelics.vcf || { echo "Missing test 6 output file!"; exit 1; } -test -s $output_folder/tumor_normal/tumor_normal.normalized.vaf.filtered_multiallelics.vcf || { echo "Missing test 6 output file!"; exit 1; } +test -s $output_folder/single_sample/single_sample.vaf.vcf || { echo "Missing test 6 output file!"; exit 1; } +test -s $output_folder/tumor_normal/tumor_normal.vaf.vcf || { echo "Missing test 6 output file!"; exit 1; } +test -s $output_folder/single_sample/single_sample.filtered_multiallelics.vcf || { echo "Missing test 6 output file!"; exit 1; } +test -s $output_folder/tumor_normal/tumor_normal.filtered_multiallelics.vcf || { echo "Missing test 6 output file!"; exit 1; } assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants" assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants" assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vaf.vcf | cut -d' ' -f 1` 72 "Wrong number of variants" -assert_eq `grep tumor_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" -assert_eq `grep normal_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" -assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants" -assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vaf.filtered_multiallelics.vcf | cut -d' ' -f 1` 53 "Wrong number of variants" -assert_eq `grep primary_af $output_folder/tumor_normal/tumor_normal.normalized.vaf.filtered_multiallelics.vcf | wc -l | cut -d' ' -f 1` 24 "Wrong number of variants" \ No newline at end of file +assert_eq `grep tumor_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" +assert_eq `grep normal_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" +assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants" +assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.filtered_multiallelics.vcf | cut -d' ' -f 1` 53 "Wrong number of variants" +assert_eq `grep primary_af $output_folder/tumor_normal/tumor_normal.filtered_multiallelics.vcf | wc -l | cut -d' ' -f 1` 24 "Wrong number of variants" \ No newline at end of file diff --git a/tests/run_test_7.sh b/tests/run_test_7.sh index 046e8cb..f7344c5 100644 --- a/tests/run_test_7.sh +++ b/tests/run_test_7.sh @@ -12,11 +12,11 @@ echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data nextflow main.nf -profile test,conda --output $output_folder --input_bams test_data/test_bams.txt --skip_multiallelic_filter test -s $output_folder/single_sample/single_sample.normalized.vcf || { echo "Missing test 7 output file!"; exit 1; } test -s $output_folder/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 7 output file!"; exit 1; } -test -s $output_folder/single_sample/single_sample.normalized.vaf.vcf || { echo "Missing test 7 output file!"; exit 1; } -test -s $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf || { echo "Missing test 7 output file!"; exit 1; } +test -s $output_folder/single_sample/single_sample.vaf.vcf || { echo "Missing test 7 output file!"; exit 1; } +test -s $output_folder/tumor_normal/tumor_normal.vaf.vcf || { echo "Missing test 7 output file!"; exit 1; } assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants" assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants" -assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vaf.vcf | cut -d' ' -f 1` 72 "Wrong number of variants" -assert_eq `grep tumor_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" -assert_eq `grep normal_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" -assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants" \ No newline at end of file +assert_eq `wc -l $output_folder/single_sample/single_sample.vaf.vcf | cut -d' ' -f 1` 72 "Wrong number of variants" +assert_eq `grep tumor_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" +assert_eq `grep normal_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" +assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants" \ No newline at end of file diff --git a/tests/run_test_8.sh b/tests/run_test_8.sh index fa536c7..8c1e2e4 100644 --- a/tests/run_test_8.sh +++ b/tests/run_test_8.sh @@ -12,11 +12,11 @@ echo -e "single_sample\tnormal:"`pwd`"/test_data/TESTX_S1_L002.bam" >> test_data nextflow main.nf -profile test,conda --output $output_folder --input_bams test_data/test_bams.txt test -s $output_folder/single_sample/single_sample.normalized.vcf || { echo "Missing test 8 output file!"; exit 1; } test -s $output_folder/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 8 output file!"; exit 1; } -test -s $output_folder/single_sample/single_sample.normalized.vaf.vcf || { echo "Missing test 8 output file!"; exit 1; } -test -s $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf || { echo "Missing test 8 output file!"; exit 1; } +test -s $output_folder/single_sample/single_sample.vaf.vcf || { echo "Missing test 8 output file!"; exit 1; } +test -s $output_folder/tumor_normal/tumor_normal.vaf.vcf || { echo "Missing test 8 output file!"; exit 1; } assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants" assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vcf | cut -d' ' -f 1` 53 "Wrong number of variants" -assert_eq `wc -l $output_folder/single_sample/single_sample.normalized.vaf.vcf | cut -d' ' -f 1` 72 "Wrong number of variants" -assert_eq `grep tumor_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" -assert_eq `grep normal_af $output_folder/single_sample/single_sample.normalized.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" -assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.normalized.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants" \ No newline at end of file +assert_eq `wc -l $output_folder/single_sample/single_sample.vaf.vcf | cut -d' ' -f 1` 72 "Wrong number of variants" +assert_eq `grep tumor_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" +assert_eq `grep normal_af $output_folder/single_sample/single_sample.vaf.vcf | wc -l | cut -d' ' -f 1` 35 "Wrong number of variants" +assert_eq `wc -l $output_folder/tumor_normal/tumor_normal.vaf.vcf | cut -d' ' -f 1` 60 "Wrong number of variants" \ No newline at end of file