Merge branch 'support-vcfs-no-ad' into 'master'

Support vcfs no FORMAT/AD + better support single VCF See merge request tron/tron-variant-normalization!4
TRON-Bioinformatics · Jun 1, 2021 · ce4e698 · ce4e698
2 parents 1c0ddaa + 1f3e62a
commit ce4e698
Show file tree

Hide file tree

Showing 6 changed files with 74 additions and 18 deletions.
diff --git a/Makefile b/Makefile
@@ -16,11 +16,15 @@ test:
 	nextflow main.nf -profile test,conda --output output/test1
 	nextflow main.nf -profile test,conda --output output/test2 --filter PASS,MNV
 	nextflow main.nf -profile test,conda --output output/test3 --skip_decompose_complex
+	nextflow main.nf -profile test,conda --input_files test_data/test_input_no_ad.txt --output output/test4
+	nextflow main.nf -profile test,conda --output output/test5 --input_files false --input_vcf test_data/test_single_sample.vcf
 
 check:
 	test -s output/test1/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 1 output file!"; exit 1; }
 	test -s output/test2/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 2 output file!"; exit 1; }
 	test -s output/test3/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 3 output file!"; exit 1; }
 	test -s output/test1/single_sample/single_sample.normalized.vcf || { echo "Missing test 1 output file!"; exit 1; }
 	test -s output/test2/single_sample/single_sample.normalized.vcf || { echo "Missing test 2 output file!"; exit 1; }
-	test -s output/test3/single_sample/single_sample.normalized.vcf || { echo "Missing test 3 output file!"; exit 1; }
+	test -s output/test3/single_sample/single_sample.normalized.vcf || { echo "Missing test 3 output file!"; exit 1; }
+	test -s output/test4/sample_no_ad/sample_no_ad.normalized.vcf || { echo "Missing test 4 output file!"; exit 1; }
+	test -s output/test5/test_single_sample/test_single_sample.normalized.vcf || { echo "Missing test 5 output file!"; exit 1; }
diff --git a/README.md b/README.md
@@ -117,17 +117,19 @@ Usage:
 
 
 Input:
-    * input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file
+    * --input_vcf: the path to a single VCF to normalize (not compatible with --input_files)
+    * --input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file (not compatible with --input_vcf)
     The input file does not have header!
     Example input file:
     sample1	/path/to/your/file.vcf
     sample2	/path/to/your/file2.vcf
-    * reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
+    * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
+    * --vcf_without_ad: flag indicating that the VCFs to normalize do not have the FORMAT/AD annotation
 
 Optional input:
-    * output: the folder where to publish output
-    * skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels)
-    * filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept
+    * --output: the folder where to publish output
+    * --skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels)
+    * --filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept
 
 Output:
     * Normalized VCF file

diff --git a/main.nf b/main.nf
@@ -2,12 +2,14 @@
 
 params.help= false
 params.input_files = false
+params.input_vcf = false
 params.reference = false
 params.output = false
 params.skip_decompose_complex = false
 params.filter = false
 params.cpus = 1
 params.memory = "4g"
+params.vcf_without_ad = false
 
 
 if (params.help) {
@@ -20,15 +22,22 @@ if (params.output) {
   publish_dir = params.output
 }
 
-// checks required inputs
-if (params.input_files) {
+if (! params.input_files && ! params.input_vcf) {  // exactly one input mode is required
+  exit 1, "Neither --input_files nor --input_vcf is provided!"
+}
+else if (params.input_files && params.input_vcf) {  // the two input modes are mutually exclusive
+  exit 1, "Both --input_files and --input_vcf are provided! Please, provide only one."
+}
+else if (params.input_files) {
   Channel
     .fromPath(params.input_files)
     .splitCsv(header: ['name', 'vcf'], sep: "\t")
     .map{ row-> tuple(row.name, file(row.vcf)) }
     .set { input_files }
-} else {
-  exit 1, "Input file not specified!"
+}
+else if (params.input_vcf) {  // single-VCF mode: build the same (name, vcf) channel as --input_files
+  input_vcf = file(params.input_vcf)
+  Channel.fromList([tuple(input_vcf.baseName, input_vcf)]).set { input_files }  // baseName = file name minus its last extension; stays non-empty when there is no extension
 }
 
 if (params.filter) {
@@ -87,13 +96,13 @@ process normalizeVcf {
     script:
         //decompose_complex = params.skip_decompose_complex ? "" : "bcftools norm --atomize - |"
         decompose_complex = params.skip_decompose_complex ? "" : "vt decompose_blocksub -a -p - |"
-
+        keep_ad_sum = params.vcf_without_ad ? "" : "--keep-sum AD"  // --keep-sum AD needs FORMAT/AD, so it is dropped only when --vcf_without_ad is set
     """
     # initial sort of the VCF
     bcftools sort ${vcf} | \
 
     # checks reference genome, decompose multiallelics, trim and left align indels
-    bcftools norm --multiallelics -any --keep-sum AD --check-ref e --fasta-ref ${params.reference} \
+    bcftools norm --multiallelics -any ${keep_ad_sum} --check-ref e --fasta-ref ${params.reference} \
     --old-rec-tag OLD_CLUMPED - | \
 
     # decompose complex variants

diff --git a/nextflow.config b/nextflow.config
@@ -27,7 +27,7 @@ env {
 // Capture exit codes from upstream processes when piping
 process.shell = ['/bin/bash', '-euo', 'pipefail']
 
-VERSION = '1.0.1'
+VERSION = '1.1.0'
 
 manifest {
   name = 'TRON-Bioinformatics/tronflow-vcf-normalization'
@@ -47,17 +47,19 @@ Usage:
 
 
 Input:
-    * input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file
+    * --input_vcf: the path to a single VCF to normalize (not compatible with --input_files)
+    * --input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file (not compatible with --input_vcf)
     The input file does not have header!
     Example input file:
     sample1	/path/to/your/file.vcf
     sample2	/path/to/your/file2.vcf
-    * reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
+    * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
 
 Optional input:
-    * output: the folder where to publish output
-    * skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels)
-    * filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept
+    * --output: the folder where to publish output
+    * --skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels)
+    * --filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept
+    * --vcf_without_ad: flag indicating that the VCFs to normalize do not have the FORMAT/AD annotation
 
 Output:
     * Normalized VCF file

diff --git a/test_data/test_input_no_ad.txt b/test_data/test_input_no_ad.txt
@@ -0,0 +1 @@
+sample_no_ad	test_data/test_no_ad.vcf
diff --git a/test_data/test_no_ad.vcf b/test_data/test_no_ad.vcf
@@ -0,0 +1,38 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation">
+##FILTER=<ID=NOT_PASSED,Description="whatever">
+##FILTER=<ID=MULTIALLELIC,Description="whatever">
+##FILTER=<ID=UNTRIMMED,Description="whatever">
+##FILTER=<ID=UNALIGNED,Description="whatever">
+##FILTER=<ID=UNALIGNED-UNTRIMMED,Description="whatever">
+##FILTER=<ID=MNV,Description="whatever">
+##FILTER=<ID=MNV-INDEL,Description="whatever">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=chr1,length=249250621,assembly=hg19>
+##contig=<ID=chr2,length=243199373,assembly=hg19>
+##contig=<ID=chr3,length=198022430,assembly=hg19>
+##contig=<ID=chr4,length=191154276,assembly=hg19>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	tumor
+chr1	13082	.	C	A	.	NOT_PASSED		GT	0/1
+chr1	13081	.	A	C	.	NOT_PASSED		GT	0/1
+chr1	13202	.	A	G	.	NOT_PASSED		GT	0/1
+chr1	13201	.	T	G	.	NOT_PASSED		GT	0/1
+chr1	13201	.	T	C	.	PASS		GT	0/1
+chr1	13263	.	T	C	.	PASS		GT	0/1
+chr1	13083	.	A	C	.	PASS		GT	0/1
+chr1	13263	.	TCC	CCC	.	UNTRIMMED		GT	0/1
+chr1	13083	.	AGA	AGC	.	UNTRIMMED		GT	0/1
+chr1	13141	.	C	AAAAAC	.	UNALIGNED		GT	0/1
+chr1	13141	.	CT	AAAAACT	.	UNALIGNED-UNTRIMMED		GT	0/1
+chr1	13141	.	CTGAGG	G	.	UNALIGNED		GT	0/1
+chr1	13141	.	CTGAGG	GG	.	UNALIGNED-UNTRIMMED		GT	0/1
+chr1	13081	.	ACAG	CCAC	.	MNV		GT	0/1
+chr1	13141	.	CTGAGG	ATGAGT	.	MNV		GT	0/1
+chr1	13201	.	TAGCCT	GAGCCC	.	MNV		GT	0/1
+chr1	13261	.	GCTCCT	CCCCCC	.	MNV		GT	0/1
+chr1	13321	.	AGCCCT	CGCC	.	MNV-INDEL		GT	0/1
+chr1	13081	.	ACA	GCAAAAA	.	MNV-INDEL		GT	0/1
+chr1	13204	.	C	G,T	.	MULTIALLELIC		GT	0/1
+chr1	13324	.	C	G,T	.	MULTIALLELIC		GT	1/2
+chr1	13262	.	C	G,T,A	.	MULTIALLELIC		GT	2/3
+chr1	13323	.	C	A,G,T	.	MULTIALLELIC		GT	2/3