From e0a774a91b5bac523384bedbe6d97167ceae1c7d Mon Sep 17 00:00:00 2001 From: priesgof Date: Tue, 1 Jun 2021 13:41:47 +0200 Subject: [PATCH 1/3] support VCFs with no AD field using --vcf_without_ad --- Makefile | 4 +++- main.nf | 5 +++-- nextflow.config | 2 +- test_data/test_input_no_ad.txt | 1 + test_data/test_no_ad.vcf | 38 ++++++++++++++++++++++++++++++++++ 5 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 test_data/test_input_no_ad.txt create mode 100755 test_data/test_no_ad.vcf diff --git a/Makefile b/Makefile index 3e04986..3ba7d7d 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,7 @@ test: nextflow main.nf -profile test,conda --output output/test1 nextflow main.nf -profile test,conda --output output/test2 --filter PASS,MNV nextflow main.nf -profile test,conda --output output/test3 --skip_decompose_complex + nextflow main.nf -profile test,conda --input_files test_data/test_input_no_ad.txt --output output/test4 --vcf_without_ad check: test -s output/test1/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 1 output file!"; exit 1; } @@ -23,4 +24,5 @@ check: test -s output/test3/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 3 output file!"; exit 1; } test -s output/test1/single_sample/single_sample.normalized.vcf || { echo "Missing test 1 output file!"; exit 1; } test -s output/test2/single_sample/single_sample.normalized.vcf || { echo "Missing test 2 output file!"; exit 1; } - test -s output/test3/single_sample/single_sample.normalized.vcf || { echo "Missing test 3 output file!"; exit 1; } \ No newline at end of file + test -s output/test3/single_sample/single_sample.normalized.vcf || { echo "Missing test 3 output file!"; exit 1; } + test -s output/test4/sample_no_ad/sample_no_ad.normalized.vcf || { echo "Missing test 4 output file!"; exit 1; } \ No newline at end of file diff --git a/main.nf b/main.nf index 0ca7544..9326e91 100755 --- a/main.nf +++ b/main.nf @@ -8,6 +8,7 @@ params.skip_decompose_complex = false params.filter 
= false params.cpus = 1 params.memory = "4g" +params.vcf_without_ad = false if (params.help) { @@ -87,13 +88,13 @@ process normalizeVcf { script: //decompose_complex = params.skip_decompose_complex ? "" : "bcftools norm --atomize - |" decompose_complex = params.skip_decompose_complex ? "" : "vt decompose_blocksub -a -p - |" - + keep_ad_sum = params.vcf_without_ad ? "" : "--keep-sum AD" """ # initial sort of the VCF bcftools sort ${vcf} | \ # checks reference genome, decompose multiallelics, trim and left align indels - bcftools norm --multiallelics -any --keep-sum AD --check-ref e --fasta-ref ${params.reference} \ + bcftools norm --multiallelics -any ${keep_ad_sum} --check-ref e --fasta-ref ${params.reference} \ --old-rec-tag OLD_CLUMPED - | \ # decompose complex variants diff --git a/nextflow.config b/nextflow.config index 9bb0d17..c22112e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,7 +27,7 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] -VERSION = '1.0.1' +VERSION = '1.1.0' manifest { name = 'TRON-Bioinformatics/tronflow-vcf-normalization' diff --git a/test_data/test_input_no_ad.txt b/test_data/test_input_no_ad.txt new file mode 100644 index 0000000..76d1ae3 --- /dev/null +++ b/test_data/test_input_no_ad.txt @@ -0,0 +1 @@ +sample_no_ad test_data/test_no_ad.vcf \ No newline at end of file diff --git a/test_data/test_no_ad.vcf b/test_data/test_no_ad.vcf new file mode 100755 index 0000000..0196c7a --- /dev/null +++ b/test_data/test_no_ad.vcf @@ -0,0 +1,38 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT tumor +chr1 13082 . C A . NOT_PASSED GT 0/1 +chr1 13081 . A C . NOT_PASSED GT 0/1 +chr1 13202 . A G . NOT_PASSED GT 0/1 +chr1 13201 . T G . NOT_PASSED GT 0/1 +chr1 13201 . T C . PASS GT 0/1 +chr1 13263 . T C . 
PASS GT 0/1 +chr1 13083 . A C . PASS GT 0/1 +chr1 13263 . TCC CCC . UNTRIMMED GT 0/1 +chr1 13083 . AGA AGC . UNTRIMMED GT 0/1 +chr1 13141 . C AAAAAC . UNALIGNED GT 0/1 +chr1 13141 . CT AAAAACT . UNALIGNED-UNTRIMMED GT 0/1 +chr1 13141 . CTGAGG G . UNALIGNED GT 0/1 +chr1 13141 . CTGAGG GG . UNALIGNED-UNTRIMMED GT 0/1 +chr1 13081 . ACAG CCAC . MNV GT 0/1 +chr1 13141 . CTGAGG ATGAGT . MNV GT 0/1 +chr1 13201 . TAGCCT GAGCCC . MNV GT 0/1 +chr1 13261 . GCTCCT CCCCCC . MNV GT 0/1 +chr1 13321 . AGCCCT CGCC . MNV-INDEL GT 0/1 +chr1 13081 . ACA GCAAAAA . MNV-INDEL GT 0/1 +chr1 13204 . C G,T . MULTIALLELIC GT 0/1 +chr1 13324 . C G,T . MULTIALLELIC GT 1/2 +chr1 13262 . C G,T,A . MULTIALLELIC GT 2/3 +chr1 13323 . C A,G,T . MULTIALLELIC GT 2/3 \ No newline at end of file From e090332a02c83de2f601ba784befababd9181280 Mon Sep 17 00:00:00 2001 From: priesgof Date: Tue, 1 Jun 2021 13:43:19 +0200 Subject: [PATCH 2/3] update documentation --- README.md | 11 ++++++----- nextflow.config | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index e290e6e..1721e96 100644 --- a/README.md +++ b/README.md @@ -117,17 +117,18 @@ Usage: Input: - * input_files: the path to a tab-separated values file containing in each row the sample name and path to the VCF file + * --input_files: the path to a tab-separated values file containing in each row the sample name and path to the VCF file The input file does not have header! 
Example input file: sample1 /path/to/your/file.vcf sample2 /path/to/your/file2.vcf - * reference: path to the FASTA genome reference (indexes expected *.fai, *.dict) + * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict) Optional input: - * output: the folder where to publish output - * skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels) - * filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept + * --output: the folder where to publish output + * --skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels) + * --filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept + * --vcf_without_ad: indicate when the VCFs to normalize do not have the FORMAT/AD annotation Output: * Normalized VCF file diff --git a/nextflow.config b/nextflow.config index c22112e..38e242a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -47,17 +47,18 @@ Usage: Input: - * input_files: the path to a tab-separated values file containing in each row the sample name and path to the VCF file + * --input_files: the path to a tab-separated values file containing in each row the sample name and path to the VCF file The input file does not have header! 
Example input file: sample1 /path/to/your/file.vcf sample2 /path/to/your/file2.vcf - * reference: path to the FASTA genome reference (indexes expected *.fai, *.dict) + * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict) Optional input: - * output: the folder where to publish output - * skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels) - * filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept + * --output: the folder where to publish output + * --skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels) + * --filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept + * --vcf_without_ad: indicate when the VCFs to normalize do not have the FORMAT/AD annotation Output: * Normalized VCF file From 1f3e62a7b46a49b3c7e9c2f1dfb580748bd0def7 Mon Sep 17 00:00:00 2001 From: priesgof Date: Tue, 1 Jun 2021 14:08:56 +0200 Subject: [PATCH 3/3] support either a file with multiple files or a single VCF --- Makefile | 4 +++- README.md | 3 ++- main.nf | 16 ++++++++++++---- nextflow.config | 3 ++- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 3ba7d7d..edc42a4 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,7 @@ test: nextflow main.nf -profile test,conda --output output/test2 --filter PASS,MNV nextflow main.nf -profile test,conda --output output/test3 --skip_decompose_complex nextflow main.nf -profile test,conda --input_files test_data/test_input_no_ad.txt --output output/test4 --vcf_without_ad + nextflow main.nf -profile test,conda --output output/test5 --input_files false --input_vcf test_data/test_single_sample.vcf check: test -s output/test1/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 1 output file!"; exit 1; } @@ -25,4 +26,5 @@ check: test -s 
output/test1/single_sample/single_sample.normalized.vcf || { echo "Missing test 1 output file!"; exit 1; } test -s output/test2/single_sample/single_sample.normalized.vcf || { echo "Missing test 2 output file!"; exit 1; } test -s output/test3/single_sample/single_sample.normalized.vcf || { echo "Missing test 3 output file!"; exit 1; } - test -s output/test4/sample_no_ad/sample_no_ad.normalized.vcf || { echo "Missing test 4 output file!"; exit 1; } \ No newline at end of file + test -s output/test4/sample_no_ad/sample_no_ad.normalized.vcf || { echo "Missing test 4 output file!"; exit 1; } + test -s output/test5/test_single_sample/test_single_sample.normalized.vcf || { echo "Missing test 5 output file!"; exit 1; } \ No newline at end of file diff --git a/README.md b/README.md index 1721e96..3dd6213 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,8 @@ Usage: Input: - * --input_files: the path to a tab-separated values file containing in each row the sample name and path to the VCF file + * --input_vcf: the path to a single VCF to normalize (not compatible with --input_files) + * --input_files: the path to a tab-separated values file containing in each row the sample name and path to the VCF file (not compatible with --input_vcf) The input file does not have header! Example input file: sample1 /path/to/your/file.vcf diff --git a/main.nf b/main.nf index 9326e91..e0ff22a 100755 --- a/main.nf +++ b/main.nf @@ -2,6 +2,7 @@ params.help= false params.input_files = false +params.input_vcf = false params.reference = false params.output = false params.skip_decompose_complex = false @@ -21,15 +22,22 @@ if (params.output) { publish_dir = params.output } -// checks required inputs -if (params.input_files) { +if (! params.input_files && ! params.input_vcf) { + exit 1, "Neither --input_files nor --input_vcf is provided!" +} +else if (params.input_files && params.input_vcf) { + exit 1, "Both --input_files and --input_vcf are provided! Please, provide only one." 
+} +else if (params.input_files) { Channel .fromPath(params.input_files) .splitCsv(header: ['name', 'vcf'], sep: "\t") .map{ row-> tuple(row.name, file(row.vcf)) } .set { input_files } -} else { - exit 1, "Input file not specified!" +} +else if (params.input_vcf) { + input_vcf = file(params.input_vcf) + Channel.fromList([tuple(input_vcf.baseName, input_vcf)]).set { input_files } } if (params.filter) { diff --git a/nextflow.config b/nextflow.config index 38e242a..448b95a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -47,7 +47,8 @@ Usage: Input: - * --input_files: the path to a tab-separated values file containing in each row the sample name and path to the VCF file + * --input_vcf: the path to a single VCF to normalize (not compatible with --input_files) + * --input_files: the path to a tab-separated values file containing in each row the sample name and path to the VCF file (not compatible with --input_vcf) The input file does not have header! Example input file: sample1 /path/to/your/file.vcf