From 511f9253f9e52f8142911c807df2e538d0413cc9 Mon Sep 17 00:00:00 2001 From: Michael Heuer Date: Fri, 25 Aug 2017 12:53:02 -0500 Subject: [PATCH] Merge VCF header lines with VCFHeaderLineCount.INTEGER correctly. --- .../converters/VariantContextConverter.scala | 30 +++++--- .../invalid/truth_small_variants.vcf | 70 +++++++++++++++++++ .../rdd/variant/VariantContextRDDSuite.scala | 12 ++++ 3 files changed, 104 insertions(+), 8 deletions(-) create mode 100644 adam-core/src/test/resources/invalid/truth_small_variants.vcf diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala index 69738eba82..02e0fee847 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala @@ -192,10 +192,17 @@ object VariantContextConverter { .find(_.getID == key) .fold(Some(fl).asInstanceOf[Option[VCFCompoundHeaderLine]])(defaultLine => { auditLine(fl, defaultLine, (newId, oldLine) => { - new VCFFormatHeaderLine(newId, - oldLine.getCountType, - oldLine.getType, - oldLine.getDescription) + if (oldLine.getCountType == VCFHeaderLineCount.INTEGER) { + new VCFFormatHeaderLine(newId, + oldLine.getCount, + oldLine.getType, + oldLine.getDescription) + } else { + new VCFFormatHeaderLine(newId, + oldLine.getCountType, + oldLine.getType, + oldLine.getDescription) + } }) }) } @@ -205,10 +212,17 @@ object VariantContextConverter { .find(_.getID == key) .fold(Some(il).asInstanceOf[Option[VCFCompoundHeaderLine]])(defaultLine => { auditLine(il, defaultLine, (newId, oldLine) => { - new VCFInfoHeaderLine(newId, - oldLine.getCountType, - oldLine.getType, - oldLine.getDescription) + if (oldLine.getCountType == VCFHeaderLineCount.INTEGER) { + new VCFInfoHeaderLine(newId, + oldLine.getCount, + oldLine.getType, + oldLine.getDescription) + } else { + new VCFInfoHeaderLine(newId, + oldLine.getCountType, + oldLine.getType, + oldLine.getDescription) + } }) }) } diff --git a/adam-core/src/test/resources/invalid/truth_small_variants.vcf b/adam-core/src/test/resources/invalid/truth_small_variants.vcf new file mode 100644 index 0000000000..bf0fe22b4b --- /dev/null +++ b/adam-core/src/test/resources/invalid/truth_small_variants.vcf @@ -0,0 +1,70 @@ +##fileformat=VCFv4.2 +##fileDate=20160824 +##CL=vcffilter -i filtered-phase-transfer.vcf.gz -o - --javascript "ensureFormatHeader(\"##FORMAT=\"); function record() {if(INTEGRATION.GT==\"1/1\") { INTEGRATION.IPS=\".\"; INTEGRATION.PS=\"HOMVAR\"; INTEGRATION.GT=\"1|1\";} else {if((INTEGRATION.GT==\"0/1\" || INTEGRATION.GT==\"1/2\" || INTEGRATION.GT==\"2/1\" || INTEGRATION.GT==\"1/0\") ) {if(INTEGRATION.IPS.length>1) {INTEGRATION.PS=INTEGRATION.IPS; INTEGRATION.GT=INTEGRATION.IGT;} else {INTEGRATION.PS=\".\";};} else { if((INTEGRATION.IPS.length<2)) { INTEGRATION.IPS=\".\";} INTEGRATION.PS=\"PATMAT\";};};}" +##RUN-ID=16dacf15-fdc9-4199-84bd-723ea8bcddef +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER=0.8"> +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT INTEGRATION +chr1 817186 . G A 50 PASS platforms=3;platformnames=Illumina,CG,Solid;datasets=3;datasetnames=HiSeqPE300x,CGnormal,SolidSE75bp;callsets=4;callsetnames=HiSeqPE300xGATK,CGnormal,HiSeqPE300xfreebayes,SolidSE75GATKHC;datasetsmissingcall=10XChromium,IonExome,SolidPE50x50bp;callable=CS_HiSeqPE300xGATK_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_SolidSE75GATKHC_filt GT:DP:ADALL:AD:GQ:IGT:IPS:PS 1|1:823:0,381:78,454:283:1/1:.:PATMAT +chr1 817341 . A G 50 PASS platforms=3;platformnames=Illumina,CG,Solid;datasets=4;datasetnames=HiSeqPE300x,CGnormal,SolidPE50x50bp,SolidSE75bp;callsets=5;callsetnames=HiSeqPE300xGATK,CGnormal,HiSeqPE300xfreebayes,SolidPE50x50GATKHC,SolidSE75GATKHC;datasetsmissingcall=10XChromium,IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_SolidPE50x50GATKHC_filt GT:DP:ADALL:AD:GQ:IGT:IPS:PS 1|1:584:0,255:107,342:327:1/1:.:PATMAT +chr1 817889 . C G 50 PASS platforms=2;platformnames=Illumina,CG;datasets=2;datasetnames=HiSeqPE300x,CGnormal;callsets=3;callsetnames=HiSeqPE300xGATK,CGnormal,HiSeqPE300xfreebayes;datasetsmissingcall=10XChromium,IonExome,SolidPE50x50bp,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable GT:DP:ADALL:AD:GQ:IGT:IPS:PS 1|1:361:0,146:74,220:209:1/1:.:PATMAT +chr1 818025 . C A 50 PASS platforms=2;platformnames=Illumina,CG;datasets=2;datasetnames=HiSeqPE300x,CGnormal;callsets=3;callsetnames=HiSeqPE300xGATK,CGnormal,HiSeqPE300xfreebayes;datasetsmissingcall=10XChromium,IonExome,SolidPE50x50bp,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_HiSeqPE300xGATK_filt,CS_HiSeqPE300xfreebayes_filt GT:DP:ADALL:AD:GQ:IGT:IPS:PS 1|1:283:0,118:43,43:219:1/1:.:PATMAT +chr1 818802 . A G 50 PASS platforms=2;platformnames=Illumina,Solid;datasets=2;datasetnames=HiSeqPE300x,SolidSE75bp;callsets=3;callsetnames=HiSeqPE300xGATK,HiSeqPE300xfreebayes,SolidSE75GATKHC;datasetsmissingcall=CGnormal,10XChromium,IonExome,SolidPE50x50bp;callable=CS_HiSeqPE300xGATK_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_HiSeqPE300xfreebayes_filt,CS_SolidSE75GATKHC_filt GT:DP:ADALL:AD:GQ:IGT:IPS:PS 1|1:432:0,205:0,202:108:1|1:818802_A_G:PATMAT +chr1 818812 . A G 50 PASS platforms=2;platformnames=Illumina,Solid;datasets=2;datasetnames=HiSeqPE300x,SolidSE75bp;callsets=3;callsetnames=HiSeqPE300xGATK,HiSeqPE300xfreebayes,SolidSE75GATKHC;datasetsmissingcall=CGnormal,10XChromium,IonExome,SolidPE50x50bp;callable=CS_HiSeqPE300xGATK_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_HiSeqPE300xfreebayes_filt,CS_SolidSE75GATKHC_filt GT:DP:ADALL:AD:GQ:IGT:IPS:PS 1|1:412:0,192:0,190:108:1|1:818802_A_G:PATMAT +chr1 818954 . T C 50 PASS platforms=3;platformnames=Illumina,CG,Solid;datasets=3;datasetnames=HiSeqPE300x,CGnormal,SolidPE50x50bp;callsets=4;callsetnames=HiSeqPE300xGATK,CGnormal,HiSeqPE300xfreebayes,SolidPE50x50GATKHC;datasetsmissingcall=10XChromium,IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CGnormal_filt,CS_SolidPE50x50GATKHC_filt GT:DP:ADALL:AD:GQ:IGT:IPS 1|1:621:0,250:0,246:233:1/1:. diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala index bc8354d07c..952f9b1461 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala @@ -129,6 +129,18 @@ class VariantContextRDDSuite extends ADAMFunSuite { assert(vcRdd.sequences.records(0).name === "chr11") } + sparkTest("transform a vcf file with bad header") { + val path = testFile("invalid/truth_small_variants.vcf") + val before = sc.loadVcf(path, ValidationStringency.SILENT) + assert(before.rdd.count == 1) + + val tempPath = tmpLocation(".adam") + before.toVariantRDD().saveAsParquet(tempPath) + + val after = sc.loadVariants(tempPath).toVariantContextRDD() + assert(after.rdd.count == 1) + } + sparkTest("don't lose any variants when piping as VCF") { val smallVcf = testFile("small.vcf") val rdd: VariantContextRDD = sc.loadVcf(smallVcf)