From e0a774a91b5bac523384bedbe6d97167ceae1c7d Mon Sep 17 00:00:00 2001
From: priesgof <priesgoferreiro@gmail.com>
Date: Tue, 1 Jun 2021 13:41:47 +0200
Subject: [PATCH 1/3] support VCFs with no AD field using --vcf_without_ad

---
 Makefile                       |  4 +++-
 main.nf                        |  5 +++--
 nextflow.config                |  2 +-
 test_data/test_input_no_ad.txt |  1 +
 test_data/test_no_ad.vcf       | 38 ++++++++++++++++++++++++++++++++++
 5 files changed, 46 insertions(+), 4 deletions(-)
 create mode 100644 test_data/test_input_no_ad.txt
 create mode 100755 test_data/test_no_ad.vcf

diff --git a/Makefile b/Makefile
index 3e04986..3ba7d7d 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@ test:
 	nextflow main.nf -profile test,conda --output output/test1
 	nextflow main.nf -profile test,conda --output output/test2 --filter PASS,MNV
 	nextflow main.nf -profile test,conda --output output/test3 --skip_decompose_complex
+	nextflow main.nf -profile test,conda --input_files test_data/test_input_no_ad.txt --output output/test4
 
 check:
 	test -s output/test1/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 1 output file!"; exit 1; }
@@ -23,4 +24,5 @@ check:
 	test -s output/test3/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 3 output file!"; exit 1; }
 	test -s output/test1/single_sample/single_sample.normalized.vcf || { echo "Missing test 1 output file!"; exit 1; }
 	test -s output/test2/single_sample/single_sample.normalized.vcf || { echo "Missing test 2 output file!"; exit 1; }
-	test -s output/test3/single_sample/single_sample.normalized.vcf || { echo "Missing test 3 output file!"; exit 1; }
\ No newline at end of file
+	test -s output/test3/single_sample/single_sample.normalized.vcf || { echo "Missing test 3 output file!"; exit 1; }
+	test -s output/test4/sample_no_ad/sample_no_ad.normalized.vcf || { echo "Missing test 4 output file!"; exit 1; }
\ No newline at end of file
diff --git a/main.nf b/main.nf
index 0ca7544..9326e91 100755
--- a/main.nf
+++ b/main.nf
@@ -8,6 +8,7 @@ params.skip_decompose_complex = false
 params.filter = false
 params.cpus = 1
 params.memory = "4g"
+params.vcf_without_ad = false
 
 
 if (params.help) {
@@ -87,13 +88,13 @@ process normalizeVcf {
     script:
         //decompose_complex = params.skip_decompose_complex ? "" : "bcftools norm --atomize - |"
         decompose_complex = params.skip_decompose_complex ? "" : "vt decompose_blocksub -a -p - |"
-
+        keep_ad_sum = params.vcf_without_ad ? "" : "--keep-sum AD"  // only pass --keep-sum AD when the VCF is expected to carry FORMAT/AD
     """
     # initial sort of the VCF
     bcftools sort ${vcf} | \
 
     # checks reference genome, decompose multiallelics, trim and left align indels
-    bcftools norm --multiallelics -any --keep-sum AD --check-ref e --fasta-ref ${params.reference} \
+    bcftools norm --multiallelics -any ${keep_ad_sum} --check-ref e --fasta-ref ${params.reference} \
     --old-rec-tag OLD_CLUMPED - | \
 
     # decompose complex variants
diff --git a/nextflow.config b/nextflow.config
index 9bb0d17..c22112e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -27,7 +27,7 @@ env {
 // Capture exit codes from upstream processes when piping
 process.shell = ['/bin/bash', '-euo', 'pipefail']
 
-VERSION = '1.0.1'
+VERSION = '1.1.0'
 
 manifest {
   name = 'TRON-Bioinformatics/tronflow-vcf-normalization'
diff --git a/test_data/test_input_no_ad.txt b/test_data/test_input_no_ad.txt
new file mode 100644
index 0000000..76d1ae3
--- /dev/null
+++ b/test_data/test_input_no_ad.txt
@@ -0,0 +1 @@
+sample_no_ad	test_data/test_no_ad.vcf
\ No newline at end of file
diff --git a/test_data/test_no_ad.vcf b/test_data/test_no_ad.vcf
new file mode 100755
index 0000000..0196c7a
--- /dev/null
+++ b/test_data/test_no_ad.vcf
@@ -0,0 +1,38 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation">
+##FILTER=<ID=NOT_PASSED,Description="whatever">
+##FILTER=<ID=MULTIALLELIC,Description="whatever">
+##FILTER=<ID=UNTRIMMED,Description="whatever">
+##FILTER=<ID=UNALIGNED,Description="whatever">
+##FILTER=<ID=UNALIGNED-UNTRIMMED,Description="whatever">
+##FILTER=<ID=MNV,Description="whatever">
+##FILTER=<ID=MNV-INDEL,Description="whatever">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=chr1,length=249250621,assembly=hg19>
+##contig=<ID=chr2,length=243199373,assembly=hg19>
+##contig=<ID=chr3,length=198022430,assembly=hg19>
+##contig=<ID=chr4,length=191154276,assembly=hg19>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	tumor
+chr1	13082	.	C	A	.	NOT_PASSED		GT	0/1
+chr1	13081	.	A	C	.	NOT_PASSED		GT	0/1
+chr1	13202	.	A	G	.	NOT_PASSED		GT	0/1
+chr1	13201	.	T	G	.	NOT_PASSED		GT	0/1
+chr1	13201	.	T	C	.	PASS		GT	0/1
+chr1	13263	.	T	C	.	PASS		GT	0/1
+chr1	13083	.	A	C	.	PASS		GT	0/1
+chr1	13263	.	TCC	CCC	.	UNTRIMMED		GT	0/1
+chr1	13083	.	AGA	AGC	.	UNTRIMMED		GT	0/1
+chr1	13141	.	C	AAAAAC	.	UNALIGNED		GT	0/1
+chr1	13141	.	CT	AAAAACT	.	UNALIGNED-UNTRIMMED		GT	0/1
+chr1	13141	.	CTGAGG	G	.	UNALIGNED		GT	0/1
+chr1	13141	.	CTGAGG	GG	.	UNALIGNED-UNTRIMMED		GT	0/1
+chr1	13081	.	ACAG	CCAC	.	MNV		GT	0/1
+chr1	13141	.	CTGAGG	ATGAGT	.	MNV		GT	0/1
+chr1	13201	.	TAGCCT	GAGCCC	.	MNV		GT	0/1
+chr1	13261	.	GCTCCT	CCCCCC	.	MNV		GT	0/1
+chr1	13321	.	AGCCCT	CGCC	.	MNV-INDEL		GT	0/1
+chr1	13081	.	ACA	GCAAAAA	.	MNV-INDEL		GT	0/1
+chr1	13204	.	C	G,T	.	MULTIALLELIC		GT	0/1
+chr1	13324	.	C	G,T	.	MULTIALLELIC		GT	1/2
+chr1	13262	.	C	G,T,A	.	MULTIALLELIC		GT	2/3
+chr1	13323	.	C	A,G,T	.	MULTIALLELIC		GT	2/3
\ No newline at end of file

From e090332a02c83de2f601ba784befababd9181280 Mon Sep 17 00:00:00 2001
From: priesgof <priesgoferreiro@gmail.com>
Date: Tue, 1 Jun 2021 13:43:19 +0200
Subject: [PATCH 2/3] update documentation

---
 README.md       | 11 ++++++-----
 nextflow.config | 11 ++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index e290e6e..1721e96 100644
--- a/README.md
+++ b/README.md
@@ -117,17 +117,18 @@ Usage:
 
 
 Input:
-    * input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file
+    * --input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file
     The input file does not have header!
     Example input file:
     sample1	/path/to/your/file.vcf
     sample2	/path/to/your/file2.vcf
-    * reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
+    * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
+    * --vcf_without_ad: indicate when the VCFs to normalize do not have the FORMAT/AD annotation
 
 Optional input:
-    * output: the folder where to publish output
-    * skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels)
-    * filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept
+    * --output: the folder where to publish output
+    * --skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels)
+    * --filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept
 
 Output:
     * Normalized VCF file
diff --git a/nextflow.config b/nextflow.config
index c22112e..38e242a 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -47,17 +47,18 @@ Usage:
 
 
 Input:
-    * input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file
+    * --input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file
     The input file does not have header!
     Example input file:
     sample1	/path/to/your/file.vcf
     sample2	/path/to/your/file2.vcf
-    * reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
+    * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
 
 Optional input:
-    * output: the folder where to publish output
-    * skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels)
-    * filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept
+    * --output: the folder where to publish output
+    * --skip_decompose_complex: flag indicating not to split complex variants (ie: MNVs and combinations of SNVs and indels)
+    * --filter: specify the filter to apply if any (e.g.: PASS), only variants with this value will be kept
+    * --vcf_without_ad: indicate when the VCFs to normalize do not have the FORMAT/AD annotation
 
 Output:
     * Normalized VCF file

From 1f3e62a7b46a49b3c7e9c2f1dfb580748bd0def7 Mon Sep 17 00:00:00 2001
From: priesgof <priesgoferreiro@gmail.com>
Date: Tue, 1 Jun 2021 14:08:56 +0200
Subject: [PATCH 3/3] support either a file with multiple files or a single VCF

---
 Makefile        |  4 +++-
 README.md       |  3 ++-
 main.nf         | 16 ++++++++++++----
 nextflow.config |  3 ++-
 4 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 3ba7d7d..edc42a4 100644
--- a/Makefile
+++ b/Makefile
@@ -17,6 +17,7 @@ test:
 	nextflow main.nf -profile test,conda --output output/test2 --filter PASS,MNV
 	nextflow main.nf -profile test,conda --output output/test3 --skip_decompose_complex
 	nextflow main.nf -profile test,conda --input_files test_data/test_input_no_ad.txt --output output/test4
+	nextflow main.nf -profile test,conda --output output/test5 --input_files false --input_vcf test_data/test_single_sample.vcf
 
 check:
 	test -s output/test1/tumor_normal/tumor_normal.normalized.vcf || { echo "Missing test 1 output file!"; exit 1; }
@@ -25,4 +26,5 @@ check:
 	test -s output/test1/single_sample/single_sample.normalized.vcf || { echo "Missing test 1 output file!"; exit 1; }
 	test -s output/test2/single_sample/single_sample.normalized.vcf || { echo "Missing test 2 output file!"; exit 1; }
 	test -s output/test3/single_sample/single_sample.normalized.vcf || { echo "Missing test 3 output file!"; exit 1; }
-	test -s output/test4/sample_no_ad/sample_no_ad.normalized.vcf || { echo "Missing test 4 output file!"; exit 1; }
\ No newline at end of file
+	test -s output/test4/sample_no_ad/sample_no_ad.normalized.vcf || { echo "Missing test 4 output file!"; exit 1; }
+	test -s output/test5/test_single_sample/test_single_sample.normalized.vcf || { echo "Missing test 5 output file!"; exit 1; }
\ No newline at end of file
diff --git a/README.md b/README.md
index 1721e96..3dd6213 100644
--- a/README.md
+++ b/README.md
@@ -117,7 +117,8 @@ Usage:
 
 
 Input:
-    * --input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file
+    * --input_vcf: the path to a single VCF to normalize (not compatible with --input_files)
+    * --input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file (not compatible with --input_vcf)
     The input file does not have header!
     Example input file:
     sample1	/path/to/your/file.vcf
diff --git a/main.nf b/main.nf
index 9326e91..e0ff22a 100755
--- a/main.nf
+++ b/main.nf
@@ -2,6 +2,7 @@
 
 params.help= false
 params.input_files = false
+params.input_vcf = false
 params.reference = false
 params.output = false
 params.skip_decompose_complex = false
@@ -21,15 +22,22 @@ if (params.output) {
   publish_dir = params.output
 }
 
-// checks required inputs
-if (params.input_files) {
+if (! params.input_files && ! params.input_vcf) {  // exactly one of --input_files / --input_vcf must be set
+  exit 1, "Neither --input_files nor --input_vcf is provided!"
+}
+else if (params.input_files && params.input_vcf) {
+  exit 1, "Both --input_files and --input_vcf are provided! Please, provide only one."
+}
+else if (params.input_files) {
   Channel
     .fromPath(params.input_files)
     .splitCsv(header: ['name', 'vcf'], sep: "\t")
     .map{ row-> tuple(row.name, file(row.vcf)) }
     .set { input_files }
-} else {
-  exit 1, "Input file not specified!"
+}
+else if (params.input_vcf) {  // single-VCF mode: sample name is the file name without its last extension
+  input_vcf = file(params.input_vcf)
+  Channel.fromList([tuple(input_vcf.baseName, input_vcf)]).set { input_files }
 }
 
 if (params.filter) {
diff --git a/nextflow.config b/nextflow.config
index 38e242a..448b95a 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -47,7 +47,8 @@ Usage:
 
 
 Input:
-    * --input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file
+    * --input_vcf: the path to a single VCF to normalize (not compatible with --input_files)
+    * --input_files: the path to a tab-separated values file containing in each row the sample name  and path to the VCF file (not compatible with --input_vcf)
     The input file does not have header!
     Example input file:
     sample1	/path/to/your/file.vcf