diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java index d26b555586e..f36a511f22f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java @@ -88,6 +88,7 @@ public final class GenotypeGVCFs extends VariantWalker { public static final String PHASED_HOM_VAR_STRING = "1|1"; public static final String ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME = "only-output-calls-starting-in-intervals"; public static final String ALL_SITES_LONG_NAME = "include-non-variant-sites"; + public static final String ALL_SITES_SHORT_NAME = "all-sites"; private static final String GVCF_BLOCK = "GVCFBlock"; @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, @@ -271,29 +272,20 @@ private VariantContext calculateGenotypes(VariantContext vc){ /** * Determines whether the provided VariantContext has real alternate alleles. * - * There is a bit of a hack to handle the case because it is not defined in htsjdk.Allele - * We check for this as a biallelic symbolic allele. - * * @param vc the VariantContext to evaluate * @return true if it has proper alternate alleles, false otherwise */ - @VisibleForTesting - static boolean isProperlyPolymorphic(final VariantContext vc) { + public static boolean isProperlyPolymorphic(final VariantContext vc) { //obvious cases if (vc == null || vc.getAlternateAlleles().isEmpty()) { return false; } else if (vc.isBiallelic()) { - return !(isSpanningDeletion(vc.getAlternateAllele(0)) || vc.isSymbolic()); + return !(GATKVCFConstants.isSpanningDeletion(vc.getAlternateAllele(0)) || vc.isSymbolic()); } else { return true; } } - @VisibleForTesting - static boolean isSpanningDeletion(final Allele allele){ - return allele.equals(Allele.SPAN_DEL) || allele.equals(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED); - } - /** * Add genotyping-based annotations to the new VC * diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/QualByDepth.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/QualByDepth.java index 783c372e4b2..0d9576857b2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/QualByDepth.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/QualByDepth.java @@ -67,6 +67,39 @@ public Map annotate(final ReferenceContext ref, return Collections.emptyMap(); } + final int depth = getDepth(genotypes, likelihoods); + + if ( depth == 0 ) { + return Collections.emptyMap(); + } + + final double qual = -10.0 * vc.getLog10PError(); + double QD = qual / depth; + + // Hack: see note in the fixTooHighQD method below + QD = fixTooHighQD(QD); + + return Collections.singletonMap(getKeyNames().get(0), String.format("%.2f", QD)); + } + + /** + * The haplotype caller generates very high quality scores when multiple events are on the + * same haplotype. This causes some very good variants to have unusually high QD values, + * and VQSR will filter these out. This code looks at the QD value, and if it is above + * threshold we map it down to the mean high QD value, with some jittering + * + * @param QD the raw QD score + * @return a QD value + */ + public static double fixTooHighQD(final double QD) { + if ( QD < MAX_QD_BEFORE_FIXING ) { + return QD; + } else { + return IDEAL_HIGH_QD + Utils.getRandomGenerator().nextGaussian() * JITTER_SIGMA; + } + } + + public static int getDepth(final GenotypesContext genotypes, final ReadLikelihoods likelihoods) { int depth = 0; int ADrestrictedDepth = 0; @@ -100,35 +133,7 @@ public Map annotate(final ReferenceContext ref, if ( ADrestrictedDepth > 0 ) { depth = ADrestrictedDepth; } - - if ( depth == 0 ) { - return Collections.emptyMap(); - } - - final double qual = -10.0 * vc.getLog10PError(); - double QD = qual / depth; - - // Hack: see note in the fixTooHighQD method below - QD = fixTooHighQD(QD); - - return Collections.singletonMap(getKeyNames().get(0), String.format("%.2f", QD)); - } - - /** - * The haplotype caller generates very high quality scores when multiple events are on the - * same haplotype. This causes some very good variants to have unusually high QD values, - * and VQSR will filter these out. This code looks at the QD value, and if it is above - * threshold we map it down to the mean high QD value, with some jittering - * - * @param QD the raw QD score - * @return a QD value - */ - public static double fixTooHighQD(final double QD) { - if ( QD < MAX_QD_BEFORE_FIXING ) { - return QD; - } else { - return IDEAL_HIGH_QD + Utils.getRandomGenerator().nextGaussian() * JITTER_SIGMA; - } + return depth; } @Override diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQuality.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQuality.java index 5fb3324d685..eb49a7096b5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQuality.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQuality.java @@ -53,12 +53,12 @@ public List getKeyNames() { @Override public List getDescriptions() { - return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0)), GATKVCFHeaderLines.getInfoLine(getRawKeyName())); + return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0))); } @Override public List getRawDescriptions() { - return getDescriptions(); + return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getRawKeyName())); } /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java index 89a1b069acf..253cc035b1c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java @@ -22,6 +22,8 @@ public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgume public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance"; public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist"; + public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands"; + public static final String GQ_BAND_SHORT_NAME = "GQB"; /** * You can use this argument to specify that HC should process a single sample out of a multisample BAM file. This @@ -73,7 +75,7 @@ public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgume * and end at 100 (exclusive). */ @Advanced - @Argument(fullName = "gvcf-gq-bands", shortName = "GQB", doc= "Exclusive upper bounds for reference confidence GQ bands " + + @Argument(fullName = GQ_BAND_LONG_NAME, shortName = GQ_BAND_SHORT_NAME, doc= "Exclusive upper bounds for reference confidence GQ bands " + "(must be in [1, 100] and specified in increasing order)", optional = true) public List GVCFGQBands = new ArrayList<>(70); { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index e509e3ddd0b..5e8fdf0f577 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -69,7 +69,7 @@ protected String callSourceString() { @Override protected boolean forceKeepAllele(final Allele allele) { - return allele == Allele.NON_REF_ALLELE || referenceConfidenceMode != ReferenceConfidenceMode.NONE + return allele.equals(Allele.NON_REF_ALLELE,false) || referenceConfidenceMode != ReferenceConfidenceMode.NONE || configuration.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF.java new file mode 100644 index 00000000000..5cdb257ea7b --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF.java @@ -0,0 +1,435 @@ +package org.broadinstitute.hellbender.tools.walkers.variantutils; + +import com.google.common.annotations.VisibleForTesting; +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.*; +import org.broadinstitute.barclay.argparser.*; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.*; +import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.GenotypeGVCFs; +import org.broadinstitute.hellbender.tools.walkers.annotator.*; +import org.broadinstitute.hellbender.tools.walkers.genotyper.*; +import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.FixedAFCalculatorProvider; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerArgumentCollection; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReferenceConfidenceMode; +import org.broadinstitute.hellbender.utils.MathUtils; +import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList; +import org.broadinstitute.hellbender.utils.genotyper.SampleList; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; +import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.hellbender.utils.variant.writers.GVCFWriter; +import picard.cmdline.programgroups.OtherProgramGroup; + +import java.io.File; +import java.util.*; + +/** + * Condense homRef blocks in a single-sample GVCF + * + *

+ * ReblockGVCF compressed a GVCF by merging hom-ref blocks that were produced using the '-ERC GVCF' or '-ERC BP_RESOLUTION' mode of the + * HaplotypeCaller according to new GQ band parameters. A joint callset produced with GVCFs reprocessed by ReblockGVCF will have + * lower precision for hom-ref genotype qualities at variant sites, but the input data footprint can be greatly reduced + * if the default GQ band parameters are used.

+ * + *

Input

+ *

+ * A HaplotypeCaller-produced gVCF to reblock + *

+ * + *

Output

+ *

+ * A smaller GVCF. + *

+ * + *

Usage example

+ *
+ * gatk ReblockGVCF \
+ *   -R reference.fasta \
+ *   -V sample1.g.vcf \
+ *   -O sample1.reblocked.g.vcf
+ * 
+ * + *

Caveats

+ *

Only single-sample gVCF files produced by HaplotypeCaller can be used as input for this tool.

+ *

By default this tool only passes through annotations used by VQSR. A different set of annotations can be specified with the usual -A argument. + *

Special note on ploidy

+ *

This tool assumes diploid genotypes.

+ * + */ +@ExperimentalFeature +@CommandLineProgramProperties(summary = "Compress a single-sample GVCF from HaplotypeCaller by merging homRef blocks using new GQ band parameters", + oneLineSummary = "Condenses homRef blocks in a single-sample GVCF", + programGroup = OtherProgramGroup.class, + omitFromCommandLine = true) +@DocumentedFeature +public final class ReblockGVCF extends VariantWalker { + + private final static int PLOIDY_TWO = 2; //assume diploid genotypes + + @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc="File to which variants should be written") + private File outputFile; + + @ArgumentCollection + public GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection(); + + @Advanced + @Argument(fullName=HaplotypeCallerArgumentCollection.GQ_BAND_LONG_NAME, shortName=HaplotypeCallerArgumentCollection.GQ_BAND_SHORT_NAME, + doc="Exclusive upper bounds for reference confidence GQ bands (must be in [1, 100] and specified in increasing order)") + public List GVCFGQBands = new ArrayList<>(); + { + GVCFGQBands.add(20); GVCFGQBands.add(100); + } + + @Advanced + @Argument(fullName="drop-low-quals", shortName="drop-low-quals", doc="Exclude variants and homRef blocks that are GQ0 from the reblocked GVCF to save space; drop low quality/uncalled alleles") + protected boolean dropLowQuals = false; + + @Advanced + @Argument(fullName="rgq-threshold-to-no-call", shortName="rgq-threshold", doc="Reference genotype quality (PL[0]) value below which variant sites will be converted to GQ0 homRef calls") + protected double rgqThreshold = 0.0; + + @Advanced + @Argument(fullName="do-qual-score-approximation", shortName="do-qual-approx", doc="Add necessary INFO field annotation to perform QUAL approximation downstream") + protected boolean doQualApprox = false; + + /** + * The rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. Note that dbSNP is not used in any way for the calculations themselves. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + + // the genotyping engine + private HaplotypeCallerGenotypingEngine genotypingEngine; + // the annotation engine + private VariantAnnotatorEngine annotationEngine; + // the INFO field annotation key names to remove + private final List infoFieldAnnotationKeyNamesToRemove = Arrays.asList(GVCFWriter.GVCF_BLOCK, + GATKVCFConstants.DOWNSAMPLED_KEY, GATKVCFConstants.HAPLOTYPE_SCORE_KEY, + GATKVCFConstants.INBREEDING_COEFFICIENT_KEY, GATKVCFConstants.MLE_ALLELE_COUNT_KEY, + GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY); + + private VariantContextWriter vcfWriter; + + @Override + public boolean useVariantAnnotations() { return true;} + + @Override + public List getDefaultVariantAnnotations() { + return Arrays.asList(new Coverage(), new RMSMappingQuality(), new ReadPosRankSumTest(), new MappingQualityRankSumTest()); + + } + + @Override + public void onTraversalStart() { + VCFHeader inputHeader = getHeaderForVariants(); + if (inputHeader.getGenotypeSamples().size() > 1) { + throw new UserException.BadInput("ReblockGVCF is a single sample tool, but the input GVCF has more than 1 sample."); + } + final Set inputHeaders = inputHeader.getMetaDataInSortedOrder(); + + final Set headerLines = new HashSet<>(inputHeaders); + // Remove GCVFBlocks, legacy headers, and annotations that aren't informative for single samples + headerLines.removeIf(vcfHeaderLine -> vcfHeaderLine.getKey().startsWith(GVCFWriter.GVCF_BLOCK) || + (vcfHeaderLine.getKey().equals("INFO")) && infoFieldAnnotationKeyNamesToRemove.contains(((VCFInfoHeaderLine)vcfHeaderLine).getID())); + + headerLines.addAll(getDefaultToolVCFHeaderLines()); + + genotypingEngine = createGenotypingEngine(new IndexedSampleList(inputHeader.getGenotypeSamples())); + createAnnotationEngine(); + + headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions(false)); + headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.RAW_QUAL_APPROX_KEY)); + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.VARIANT_DEPTH_KEY)); + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MAPPING_QUALITY_DEPTH)); + + if ( dbsnp.dbsnp != null ) { + VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY); + } + + VariantContextWriter writer = createVCFWriter(outputFile); + + try { + vcfWriter = new GVCFWriter(writer, GVCFGQBands, PLOIDY_TWO); + } catch ( IllegalArgumentException e ) { + throw new IllegalArgumentException("GQBands are malformed: " + e.getMessage(), e); + } + vcfWriter.writeHeader(new VCFHeader(headerLines, inputHeader.getGenotypeSamples())); + + logger.info("Notice that the -ploidy parameter is ignored in " + getClass().getSimpleName() + " tool as this is tool assumes a diploid sample"); + } + + private HaplotypeCallerGenotypingEngine createGenotypingEngine(SampleList samples) { + final HaplotypeCallerArgumentCollection hcArgs = new HaplotypeCallerArgumentCollection(); + // create the genotyping engine + hcArgs.outputMode = OutputMode.EMIT_ALL_SITES; + hcArgs.annotateAllSitesWithPLs = true; + hcArgs.genotypeArgs = new GenotypeCalculationArgumentCollection(genotypeArgs); + hcArgs.emitReferenceConfidence = ReferenceConfidenceMode.GVCF; //this is important to force emission of all alleles at a multiallelic site + return new HaplotypeCallerGenotypingEngine(hcArgs, samples, FixedAFCalculatorProvider.createThreadSafeProvider(hcArgs), true); + + } + + @VisibleForTesting + protected void createAnnotationEngine() { + annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false); + } + + // get VariantContexts from input gVCFs and regenotype + @Override + public void apply(VariantContext variant, ReadsContext reads, ReferenceContext ref, FeatureContext features) { + final VariantContext newVC = regenotypeVC(variant); + if (newVC != null) { + vcfWriter.add(newVC); + } + } + + /** + * Re-genotype (and re-annotate) a VariantContext + * Note that the GVCF write takes care of the actual homRef block merging based on {@code GVCFGQBands} + * + * @param originalVC the combined genomic VC + * @return a new VariantContext or null if the site turned monomorphic and we don't want such sites + */ + private VariantContext regenotypeVC(final VariantContext originalVC) { + VariantContext result = originalVC; + + //Pass back ref-conf homRef sites/blocks to be combined by the GVCFWriter + if (isHomRefBlock(result)) { + return filterHomRefBlock(result); + } + + //don't need to calculate quals for sites with no data whatsoever or sites already genotyped homRef, + // but if STAND_CALL_CONF > 0 we need to drop low quality alleles and regenotype + if (result.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) > 0 && !isHomRefCall(result)) { + final GenotypeLikelihoodsCalculationModel model = result.getType() == VariantContext.Type.INDEL + ? GenotypeLikelihoodsCalculationModel.INDEL + : GenotypeLikelihoodsCalculationModel.SNP; + result = genotypingEngine.calculateGenotypes(originalVC, model, null); + } + + if (result == null) { + return null; + } + + //variants with PL[0] less than threshold get turned to homRef with PL=[0,0,0], shouldn't get INFO attributes + //make sure we can call het variants with GQ >= rgqThreshold in joint calling downstream + if(shouldBeReblocked(result)) { + return lowQualVariantToGQ0HomRef(result, originalVC); + } + //high quality variant + else { + return cleanUpHighQualityVariant(result, originalVC); + } + } + + private boolean isHomRefBlock(final VariantContext result) { + return result.getLog10PError() == VariantContext.NO_LOG10_PERROR; + } + + /** + * determine if VC is a homRef "call", i.e. an annotated variant with non-symbolic alt alleles and homRef genotypes + * we treat these differently from het/homVar calls or homRef blocks + * @param result VariantContext to process + * @return true if VC is a 0/0 call and not a homRef block + */ + private boolean isHomRefCall(final VariantContext result) { + final Genotype genotype = result.getGenotype(0); + return genotype.isHomRef() && result.getLog10PError() != VariantContext.NO_LOG10_PERROR; + } + + private VariantContext filterHomRefBlock(final VariantContext result) { + final Genotype genotype = result.getGenotype(0); + if (dropLowQuals && (genotype.getGQ() <= rgqThreshold || genotype.getGQ() == 0)) { + return null; + } + else if (genotype.isCalled() && genotype.isHomRef()) { + return result; + } + else if (!genotype.isCalled() && genotype.hasPL() && genotype.getPL()[0] == 0) { + return result; + } + else { + return null; + } + } + + private boolean shouldBeReblocked(final VariantContext result) { + final Genotype genotype = result.getGenotype(0); + return genotype.getPL()[0] < rgqThreshold || genotype.isHomRef(); + } + + /** + * "reblock" a variant by converting its genotype to homRef, changing PLs, adding reblock END tags and other attributes + * @param result a variant already determined to be low quality + * @param originalVC the variant context with the original, full set of alleles + * @return + */ + @VisibleForTesting + public VariantContext lowQualVariantToGQ0HomRef(final VariantContext result, final VariantContext originalVC) { + if(dropLowQuals && !isHomRefCall(result)) { + return null; + } + + final Map attrMap = new HashMap<>(); + final GenotypeBuilder gb = changeCallToGQ0HomRef(result, attrMap); + + //there are some cases where there are low quality variants with homRef calls AND alt alleles! + //TODO: the best thing would be to take the most likely alt's likelihoods + if (isHomRefCall(originalVC)) { + final Genotype genotype = result.getGenotype(0); + final int[] idxVector = originalVC.getGLIndicesOfAlternateAllele(Allele.NON_REF_ALLELE); //this is always length 3 + final int[] multiallelicPLs = genotype.getPL(); + final int[] newPLs = new int[3]; + newPLs[0] = multiallelicPLs[idxVector[0]]; + newPLs[1] = multiallelicPLs[idxVector[1]]; + newPLs[2] = multiallelicPLs[idxVector[2]]; + gb.PL(newPLs); + if (genotype.hasAD()) { + int depth = (int) MathUtils.sum(genotype.getAD()); + gb.DP(depth); + gb.attribute(GATKVCFConstants.MIN_DP_FORMAT_KEY, depth); + } + } + + VariantContextBuilder builder = new VariantContextBuilder(result); + final Genotype newG = gb.make(); + return builder.alleles(Arrays.asList(newG.getAlleles().get(0), Allele.NON_REF_ALLELE)).unfiltered().log10PError(VariantContext.NO_LOG10_PERROR).attributes(attrMap).genotypes(newG).make(); //genotyping engine will add lowQual filter, so strip it off + } + + /** + * Note that this modifies {@code attrMap} as a side effect + * @param result a VC to be converted to a GQ0 homRef call + * @param attrMap the new VC attribute map, to update the END tag as necessary + * @return a GenotypeBuilder to make a 0/0 call with PLs=[0,0,0] + */ + @VisibleForTesting + protected GenotypeBuilder changeCallToGQ0HomRef(final VariantContext result, final Map attrMap) { + Genotype genotype = result.getGenotype(0); + Allele newRef = result.getReference(); + GenotypeBuilder gb = new GenotypeBuilder(genotype); + //NB: If we're dropping a deletion allele, then we need to trim the reference and add an END tag with the vc stop position + if (result.getReference().length() > 1) { + attrMap.put(VCFConstants.END_KEY, result.getEnd()); + newRef = Allele.create(newRef.getBases()[0], true); + gb.alleles(Collections.nCopies(PLOIDY_TWO, newRef)); + } + //if GT is not homRef, correct it + if (!isHomRefCall(result)) { + gb.PL(new int[3]); //3 for diploid PLs, automatically initializes to zero + gb.GQ(0).noAD().alleles(Collections.nCopies(PLOIDY_TWO, newRef)).noAttributes(); + } + return gb; + } + + @VisibleForTesting + protected VariantContext cleanUpHighQualityVariant(final VariantContext result, final VariantContext originalVC) { + Map attrMap = new HashMap<>(); + Map origMap = originalVC.getAttributes(); + //copy over info annotations + for(final InfoFieldAnnotation annotation : annotationEngine.getInfoAnnotations()) { + for (final String key : annotation.getKeyNames()) { + if (infoFieldAnnotationKeyNamesToRemove.contains(key)) { + continue; + } + if (origMap.containsKey(key)) { + attrMap.put(key, origMap.get(key)); + } + } + } + final Genotype genotype = result.getGenotype(0); + if (doQualApprox && genotype.hasPL()) { + attrMap.put(GATKVCFConstants.RAW_QUAL_APPROX_KEY, genotype.getPL()[0]); + int varDP = QualByDepth.getDepth(result.getGenotypes(), null); + if (varDP == 0) { //prevent QD=Infinity case + varDP = result.getAttributeAsInt(VCFConstants.DEPTH_KEY, 1); //if there's no VarDP and no DP, just prevent Infs/NaNs and QD will get capped later + } + attrMap.put(GATKVCFConstants.VARIANT_DEPTH_KEY, varDP); + } + VariantContextBuilder builder = new VariantContextBuilder(result); + builder.attributes(attrMap); + + boolean allelesNeedSubsetting = false; + List allelesToDrop = new ArrayList<>(); + if (dropLowQuals) { + //drop low quality alleles iff we're dropping low quality variants (mostly because this can introduce GVCF gaps if deletion alleles are dropped) + for (final Allele currAlt : result.getAlternateAlleles()) { + boolean foundMatch = false; + for (final Allele gtAllele : genotype.getAlleles()) { + if (gtAllele.equals(currAlt, false)) { + foundMatch = true; + break; + } + if (gtAllele.equals(Allele.NON_REF_ALLELE)) { + if (dropLowQuals) { //don't regenotype, just drop it -- this is a GQ 0 case if ever I saw one + return null; + } else { + GenotypeBuilder gb = changeCallToGQ0HomRef(result, attrMap); + return builder.alleles(Arrays.asList(result.getReference(), Allele.NON_REF_ALLELE)).unfiltered().log10PError(VariantContext.NO_LOG10_PERROR).attributes(attrMap).genotypes(gb.make()).make(); + } + } + } + if (!foundMatch && !currAlt.isSymbolic()) { + allelesNeedSubsetting = true; + allelesToDrop.add(currAlt); + } + } + } + //remove any AD reads for the non-ref + int nonRefInd = result.getAlleleIndex(Allele.NON_REF_ALLELE); + boolean genotypesWereModified = false; + final ArrayList genotypesArray = new ArrayList<>(); + GenotypesContext newGenotypes = result.getGenotypes(); + Genotype g = genotype; + if(g.hasAD()) { + int[] ad = g.getAD(); + if (ad.length >= nonRefInd && ad[nonRefInd] > 0) { //only initialize a builder if we have to + genotypesWereModified = true; + GenotypeBuilder gb = new GenotypeBuilder(g); + ad[nonRefInd] = 0; + gb.AD(ad).DP((int) MathUtils.sum(ad)); + genotypesArray.add(gb.make()); + newGenotypes = GenotypesContext.create(genotypesArray); + } + } + else { + genotypesArray.add(g); + } + + //we're going to approximate MQ_DP with the site-level DP (should be informative and uninformative reads), which is pretty safe because it will only differ if reads are missing MQ + attrMap.put(GATKVCFConstants.MAPPING_QUALITY_DEPTH, originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY,0)); + + if(allelesNeedSubsetting) { + List newAlleleSet = new ArrayList<>(); + for(final Allele a : result.getAlleles()) { + newAlleleSet.add(a); + } + newAlleleSet.removeAll(allelesToDrop); + builder.alleles(newAlleleSet); + if(!genotypesWereModified) { + builder.genotypes(AlleleSubsettingUtils.subsetAlleles(result.getGenotypes(), PLOIDY_TWO, result.getAlleles(), newAlleleSet, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, result.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0))); + } + else { //again, only initialize a builder if we have to + builder.genotypes(AlleleSubsettingUtils.subsetAlleles(newGenotypes, PLOIDY_TWO, result.getAlleles(), newAlleleSet, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, result.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0))); + } + //only trim if we're subsetting alleles, and we only subset if we're allowed to drop sites, as per the -drop-low-quals arg + return GATKVariantContextUtils.reverseTrimAlleles(builder.attributes(attrMap).unfiltered().make()); + } + return builder.attributes(attrMap).genotypes(newGenotypes).unfiltered().make(); + } + + @Override + public void closeTool() { + if ( vcfWriter != null ) { + vcfWriter.close(); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java index 809ce18356f..ea243eb3641 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java @@ -19,10 +19,7 @@ import org.broadinstitute.hellbender.utils.param.ParamUtils; import javax.annotation.Nonnull; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.List; +import java.util.*; import java.util.function.*; import java.util.stream.Collectors; import java.util.stream.IntStream; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVCFConstants.java index ef27535522a..db2474e937b 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVCFConstants.java @@ -20,6 +20,7 @@ public final class GATKVCFConstants { public static final String ALLELE_SPECIFIC_PREFIX = "AS_"; public static final String AS_FILTER_STATUS_KEY = "AS_FilterStatus"; public static final String RAW_RMS_MAPPING_QUALITY_KEY = "RAW_MQ"; + public static final String MAPPING_QUALITY_DEPTH = "MQ_DP"; public static final String AS_RMS_MAPPING_QUALITY_KEY = "AS_MQ"; public static final String AS_RAW_RMS_MAPPING_QUALITY_KEY = "AS_RAW_MQ"; public static final String AS_CULPRIT_KEY = "AS_culprit"; @@ -81,6 +82,8 @@ public final class GATKVCFConstants { public static final String QUAL_BY_DEPTH_KEY = "QD"; public static final String AS_QUAL_BY_DEPTH_KEY = "AS_QD"; public static final String AS_QUAL_KEY = "AS_QUAL"; + public static final String RAW_QUAL_APPROX_KEY = "QUALapprox"; + public static final String VARIANT_DEPTH_KEY = "VarDP"; public static final String READ_POS_RANK_SUM_KEY = "ReadPosRankSum"; public static final String AS_READ_POS_RANK_SUM_KEY = "AS_ReadPosRankSum"; public static final String AS_RAW_READ_POS_RANK_SUM_KEY = "AS_RAW_ReadPosRankSum"; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVCFHeaderLines.java b/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVCFHeaderLines.java index 504a61c44a0..d4150e53750 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVCFHeaderLines.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVCFHeaderLines.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.utils.variant; import htsjdk.variant.vcf.*; +import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.utils.Utils; import java.util.*; @@ -13,9 +14,24 @@ */ public class GATKVCFHeaderLines { - public static VCFInfoHeaderLine getInfoLine(final String id) { return infoLines.get(id); } - public static VCFFormatHeaderLine getFormatLine(final String id) { return formatLines.get(id); } - public static VCFFilterHeaderLine getFilterLine(final String id) { return filterLines.get(id); } + public static VCFInfoHeaderLine getInfoLine(final String id) { + if (!infoLines.containsKey(id)) { + throw new IllegalStateException("No VCF INFO header line found for key " + id); + } + return infoLines.get(id); + } + public static VCFFormatHeaderLine getFormatLine(final String id) { + if (!formatLines.containsKey(id)) { + throw new IllegalStateException("No VCF FORMAT header line found for key " + id); + } + return formatLines.get(id); + } + public static VCFFilterHeaderLine getFilterLine(final String id) { + if (!filterLines.containsKey(id)) { + throw new IllegalStateException("No VCF FILTER header line found for key " + id); + } + return filterLines.get(id); + } public static Set getAllInfoLines() { return Collections.unmodifiableSet(new HashSet<>(infoLines.values())); } public static Set getAllFormatLines() { return Collections.unmodifiableSet(new HashSet<>(formatLines.values())); } @@ -126,6 +142,7 @@ private static void addFilterLine(final VCFFilterHeaderLine line) { addInfoLine(new VCFInfoHeaderLine(MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities")); addInfoLine(new VCFInfoHeaderLine(AS_MAP_QUAL_RANK_SUM_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "allele specific Z-score From Wilcoxon rank sum test of each Alt vs. Ref read mapping qualities")); addInfoLine(new VCFInfoHeaderLine(RAW_RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Raw data for RMS Mapping Quality")); + addInfoLine(new VCFInfoHeaderLine(MAPPING_QUALITY_DEPTH, 1, VCFHeaderLineType.Integer, "Depth over variant samples for better MQ calculation")); addInfoLine(new VCFInfoHeaderLine(AS_RAW_RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.String, "Allele-specfic raw data for RMS Mapping Quality")); addInfoLine(new VCFInfoHeaderLine(AS_RMS_MAPPING_QUALITY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele-specific RMS Mapping Quality")); addInfoLine(new VCFInfoHeaderLine(AS_RAW_MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.String, "Allele-specfic raw data for Mapping Quality Rank Sum")); @@ -140,6 +157,8 @@ private static void addFilterLine(final VCFFilterHeaderLine line) { addInfoLine(new VCFInfoHeaderLine(QUAL_BY_DEPTH_KEY, 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); addInfoLine(new VCFInfoHeaderLine(AS_QUAL_BY_DEPTH_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele-specific Variant Confidence/Quality by Depth")); addInfoLine(new VCFInfoHeaderLine(AS_QUAL_KEY, 1, VCFHeaderLineType.Float, "Allele-specific Variant Qual Score")); + addInfoLine(new VCFInfoHeaderLine(RAW_QUAL_APPROX_KEY, 1, VCFHeaderLineType.Integer, "Sum of PL[0] values; used to approximate the QUAL score")); + addInfoLine(new VCFInfoHeaderLine(VARIANT_DEPTH_KEY, 1, VCFHeaderLineType.Integer, "(informative) depth over variant genotypes")); addInfoLine(new VCFInfoHeaderLine(READ_POS_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias")); addInfoLine(new VCFInfoHeaderLine(AS_READ_POS_RANK_SUM_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "allele specific Z-score from Wilcoxon rank sum test of each Alt vs. Ref read position bias")); addInfoLine(new VCFInfoHeaderLine(AS_RAW_READ_POS_RANK_SUM_KEY, 1, VCFHeaderLineType.String, "allele specific raw data for rank sum test of read position bias")); diff --git a/src/main/java/org/broadinstitute/hellbender/utils/variant/writers/GVCFWriter.java b/src/main/java/org/broadinstitute/hellbender/utils/variant/writers/GVCFWriter.java index f8b5f72d970..92b70d1ce6d 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/variant/writers/GVCFWriter.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/variant/writers/GVCFWriter.java @@ -26,6 +26,8 @@ */ public final class GVCFWriter implements VariantContextWriter { + public final static String GVCF_BLOCK = "GVCFBlock"; + /** Where we'll ultimately write our VCF records */ private final VariantContextWriter underlyingWriter; @@ -123,7 +125,7 @@ public void writeHeader(VCFHeader header) { static VCFHeaderLine rangeToVCFHeaderLine(Range genotypeQualityBand) { // Need to uniquify the key for the header line using the min/max GQ, since // VCFHeader does not allow lines with duplicate keys. - final String key = String.format("GVCFBlock%d-%d", genotypeQualityBand.lowerEndpoint(), genotypeQualityBand.upperEndpoint()); + final String key = String.format(GVCF_BLOCK+"%d-%d", genotypeQualityBand.lowerEndpoint(), genotypeQualityBand.upperEndpoint()); return new VCFHeaderLine(key, "minGQ=" + genotypeQualityBand.lowerEndpoint() + "(inclusive),maxGQ=" + genotypeQualityBand.upperEndpoint() + "(exclusive)"); } @@ -155,9 +157,10 @@ public boolean checkError() { protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) { if (nextAvailableStart != -1) { - // don't create blocks while the hom-ref site falls before nextAvailableStart (for deletions) - if (vc.getStart() <= nextAvailableStart && vc.getContig().equals(contigOfNextAvailableStart)) { - return null; + //there's a use case here related to ReblockGVCFs for overlapping deletions on different haplotypes + if ( vc.getStart() <= nextAvailableStart && vc.getContig().equals(contigOfNextAvailableStart) ) { + if (vc.getEnd() <= nextAvailableStart) + return null; } // otherwise, reset to non-relevant nextAvailableStart = -1; @@ -166,7 +169,7 @@ protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g final VariantContext result; if (genotypeCanBeMergedInCurrentBlock(g)) { - currentBlock.add(vc.getStart(), g); + currentBlock.add(vc.getStart(), vc.getAttributeAsInt(VCFConstants.END_KEY, vc.getStart()), g); result = null; } else { result = currentBlock != null ? currentBlock.toVariantContext(sampleName): null; @@ -211,7 +214,7 @@ private HomRefBlock createNewBlock(final VariantContext vc, final Genotype g) { // create the block, add g to it, and return it for use final HomRefBlock block = new HomRefBlock(vc, partition.lowerEndpoint(), partition.upperEndpoint(), defaultPloidy); - block.add(vc.getStart(), g); + block.add(vc.getStart(), vc.getAttributeAsInt(VCFConstants.END_KEY, vc.getStart()), g); return block; } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/variant/writers/HomRefBlock.java b/src/main/java/org/broadinstitute/hellbender/utils/variant/writers/HomRefBlock.java index d789b5c7b69..78d1d5a34e8 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/variant/writers/HomRefBlock.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/variant/writers/HomRefBlock.java @@ -106,6 +106,17 @@ private Genotype createHomRefGenotype(String sampleName) { * @param genotype A non-null Genotype with GQ and DP attributes */ public void add(final int pos, final Genotype genotype) { + add(pos, pos, genotype); + } + + /** + * Add a homRef block to the current block + * + * @param pos current genomic position + * @param newEnd new calculated block end position + * @param genotype A non-null Genotype with GQ and DP attributes + */ + public void add(final int pos, final int newEnd, final Genotype genotype) { Utils.nonNull(genotype, "genotype cannot be null"); if ( ! genotype.hasPL() ) { throw new IllegalArgumentException("genotype must have PL field");} if ( pos != end + 1 ) { throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous end " + end); } @@ -144,7 +155,7 @@ public void add(final int pos, final Genotype genotype) { } } - end = pos; + end = newEnd; DPs.add(Math.max(genotype.getDP(), 0)); // DP must be >= 0 } @@ -186,7 +197,7 @@ int getGQLowerBound() { } public boolean isContiguous(final VariantContext vc) { - return (vc.getEnd() == getEnd() + 1) && startingVC.getContig().equals(vc.getContig()); + return (vc.getStart() == getEnd() + 1) && startingVC.getContig().equals(vc.getContig()); } public VariantContext getStartingVC() { diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsUnitTest.java index 1da63e6b4e5..330cd7f7ab5 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsUnitTest.java @@ -165,6 +165,6 @@ public Object[][] getSpanningAndNonSpanningAlleles(){ @Test(dataProvider = "getSpanningAndNonSpanningAlleles") public void testIsSpanningDeletion(Allele allele, boolean expected){ - Assert.assertEquals(GenotypeGVCFs.isSpanningDeletion(allele), expected); + Assert.assertEquals(GATKVCFConstants.isSpanningDeletion(allele), expected); } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQualityUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQualityUnitTest.java index 6494c915edd..471a3417df1 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQualityUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQualityUnitTest.java @@ -38,9 +38,9 @@ public void testAllNull() throws Exception { @Test public void testDescriptions() throws Exception { final InfoFieldAnnotation cov = new RMSMappingQuality(); - Assert.assertEquals(cov.getDescriptions().size(), 2); + Assert.assertEquals(cov.getDescriptions().size(), 1); Assert.assertEquals(cov.getDescriptions().get(0).getID(), VCFConstants.RMS_MAPPING_QUALITY_KEY); - Assert.assertEquals(cov.getDescriptions().get(1).getID(), GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY); + Assert.assertEquals(((RMSMappingQuality)cov).getRawDescriptions().get(0).getID(), GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY); Assert.assertEquals(new RMSMappingQuality().getRawKeyName(), GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY); Assert.assertEquals(new RMSMappingQuality().getKeyNames(), Sets.newHashSet(VCFConstants.RMS_MAPPING_QUALITY_KEY, GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY)); } @@ -54,7 +54,7 @@ public void testNullLikelihoods() throws Exception { Assert.assertTrue(annotate.isEmpty()); Assert.assertEquals(cov.getDescriptions().get(0).getID(), VCFConstants.RMS_MAPPING_QUALITY_KEY); - Assert.assertEquals(cov.getDescriptions().get(1).getID(), GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY); + Assert.assertEquals(((RMSMappingQuality)cov).getRawDescriptions().get(0).getID(), GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY); Assert.assertEquals(cov.getDescriptions().get(0).getID(), VCFConstants.RMS_MAPPING_QUALITY_KEY); } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFIntegrationTest.java new file mode 100644 index 00000000000..a87fe974159 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFIntegrationTest.java @@ -0,0 +1,83 @@ +package org.broadinstitute.hellbender.tools.walkers.variantutils; + +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureDataSource; +import org.broadinstitute.hellbender.utils.IntervalUtils; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.testutils.CommandLineProgramTester; +import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; +import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; + +public class ReblockGVCFIntegrationTest extends CommandLineProgramTest { + + private static final String hg38_reference_20_21 = largeFileTestDir + "Homo_sapiens_assembly38.20.21.fasta"; + private static final String b37_reference_20_21 = largeFileTestDir + "human_g1k_v37.20.21.fasta"; + + @Test + public void testJustOneSample() throws Exception { + final IntegrationTestSpec spec = new IntegrationTestSpec( + "-L chr20:69485-69791 -O %s -R " + hg38_reference_20_21 + + " -V " + getToolTestDataDir() + "gvcfForReblocking.g.vcf -rgq-threshold 20" + + " --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE + " false", + Arrays.asList(getToolTestDataDir() + "testJustOneSample.expected.g.vcf")); + spec.executeTest("testJustOneSample", this); + } + + @Test + public void testGVCFReblockingIsContiguous() throws Exception { + final File output = createTempFile("reblockedgvcf", ".vcf"); + final File expected = new File(largeFileTestDir + "testProductionGVCF.expected.g.vcf.gz"); + + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.addReference(new File(b37_reference_20_21)) + .addArgument("V", largeFileTestDir + "NA12878.prod.chr20snippet.g.vcf.gz") + .addArgument("rgq-threshold", "20") + .addArgument("L", "20:60001-1000000") + .addOutput(output); + runCommandLine(args); + + final CommandLineProgramTester validator = ValidateVariants.class::getSimpleName; + final ArgumentsBuilder args2 = new ArgumentsBuilder(); + args2.addArgument("R", b37_reference_20_21); + args2.addArgument("V", output.getAbsolutePath()); + args2.addArgument("L", IntervalUtils.locatableToString(new SimpleInterval("20:60001-1000000"))); + args2.add("-gvcf"); + validator.runCommandLine(args2); //will throw a UserException if GVCF isn't contiguous + + try (final FeatureDataSource actualVcs = new FeatureDataSource<>(output); + final FeatureDataSource expectedVcs = new FeatureDataSource<>(expected)) { + GATKBaseTest.assertCondition(actualVcs, expectedVcs, + (a, e) -> VariantContextTestUtils.assertVariantContextsAreEqual(a, e, + Collections.emptyList())); + } + } + + @Test + public void testOneSampleAsForGnomAD() throws Exception { + final IntegrationTestSpec spec = new IntegrationTestSpec( + "-drop-low-quals -do-qual-approx -L chr20:69485-69791 -O %s -R " + hg38_reference_20_21 + + " -V " + getToolTestDataDir() + "gvcfForReblocking.g.vcf" + + " --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE + " false", + Arrays.asList(getToolTestDataDir() + "testOneSampleAsForGnomAD.expected.g.vcf")); + spec.executeTest("testOneSampleDropLows", this); + } + + @Test //covers non-ref AD and non-ref GT corrections + public void testNonRefADCorrection() throws Exception { + final IntegrationTestSpec spec = new IntegrationTestSpec( + "-O %s -R " + hg38_reference_20_21 + + " -V " + getToolTestDataDir() + "nonRefAD.g.vcf" + + " --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE + " false", + Arrays.asList(getToolTestDataDir() + "testNonRefADCorrection.expected.g.vcf")); + spec.executeTest("testNonRefADCorrection", this); + } +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFUnitTest.java new file mode 100644 index 00000000000..fd1b22370ef --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFUnitTest.java @@ -0,0 +1,111 @@ +package org.broadinstitute.hellbender.tools.walkers.variantutils; + +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.vcf.VCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class ReblockGVCFUnitTest { + private final static Allele LONG_REF = Allele.create("ACTG", true); + private final static Allele DELETION = Allele.create("A", false); + private final static Allele SHORT_REF = Allele.create("A", true); + private final static Allele LONG_SNP = Allele.create("TCTA", false); + + @Test + public void testCleanUpHighQualityVariant() { + final ReblockGVCF reblocker = new ReblockGVCF(); + //We need an annotation engine for cleanUpHighQualityVariant() + reblocker.createAnnotationEngine(); + reblocker.dropLowQuals = true; + reblocker.doQualApprox = true; + + final Genotype g0 = makeG("sample1", LONG_REF, DELETION, 41, 0, 37, 200, 100, 200, 400, 600, 800); + final Genotype g = addAD(g0,0,13,17,0); + final VariantContext extraAlt0 = makeDeletionVC("lowQualVar", Arrays.asList(LONG_REF, DELETION, LONG_SNP, Allele.NON_REF_ALLELE), LONG_REF.length(), g); + final Map attr = new HashMap<>(); + attr.put(VCFConstants.DEPTH_KEY, 32); + final VariantContext extraAlt = addAttributes(extraAlt0, attr); + //we'll call this with the same VC again under the assumption that STAND_CALL_CONF is zero so no alleles/GTs change + final VariantContext cleaned1 = reblocker.cleanUpHighQualityVariant(extraAlt, extraAlt); + Assert.assertTrue(cleaned1.getAlleles().size() == 3); + Assert.assertTrue(cleaned1.getAlleles().contains(LONG_REF)); + Assert.assertTrue(cleaned1.getAlleles().contains(DELETION)); + Assert.assertTrue(cleaned1.getAlleles().contains(Allele.NON_REF_ALLELE)); + Assert.assertTrue(cleaned1.hasAttribute(GATKVCFConstants.RAW_QUAL_APPROX_KEY)); + Assert.assertTrue(cleaned1.getAttribute(GATKVCFConstants.RAW_QUAL_APPROX_KEY).equals(41)); + Assert.assertTrue(cleaned1.hasAttribute(GATKVCFConstants.VARIANT_DEPTH_KEY)); + Assert.assertTrue(cleaned1.getAttribute(GATKVCFConstants.VARIANT_DEPTH_KEY).equals(30)); + Assert.assertTrue(cleaned1.hasAttribute(GATKVCFConstants.MAPPING_QUALITY_DEPTH)); + Assert.assertTrue(cleaned1.getAttribute(GATKVCFConstants.MAPPING_QUALITY_DEPTH).equals(32)); + + final Genotype hetNonRef = makeG("sample2", DELETION, LONG_SNP, 891,879,1128,84,0,30,891,879,84,891); + final VariantContext keepAlts = makeDeletionVC("keepAllAlts", Arrays.asList(LONG_REF, DELETION, LONG_SNP, Allele.NON_REF_ALLELE), LONG_REF.length(), hetNonRef); + Assert.assertTrue(keepAlts.getAlleles().size() == 4); + Assert.assertTrue(keepAlts.getAlleles().contains(LONG_REF)); + Assert.assertTrue(keepAlts.getAlleles().contains(DELETION)); + Assert.assertTrue(keepAlts.getAlleles().contains(LONG_SNP)); + Assert.assertTrue(keepAlts.getAlleles().contains(Allele.NON_REF_ALLELE)); + } + + @Test + public void testLowQualVariantToGQ0HomRef() { + final ReblockGVCF reblocker = new ReblockGVCF(); + + reblocker.dropLowQuals = true; + final Genotype g = makeG("sample1", LONG_REF, Allele.NON_REF_ALLELE, 200, 100, 200, 11, 0, 37); + final VariantContext toBeNoCalled = makeDeletionVC("lowQualVar", Arrays.asList(LONG_REF, DELETION, Allele.NON_REF_ALLELE), LONG_REF.length(), g); + final VariantContext dropped = reblocker.lowQualVariantToGQ0HomRef(toBeNoCalled, toBeNoCalled); + Assert.assertEquals(dropped, null); + + reblocker.dropLowQuals = false; + final VariantContext modified = reblocker.lowQualVariantToGQ0HomRef(toBeNoCalled, toBeNoCalled); + Assert.assertTrue(modified.getAttributes().containsKey(VCFConstants.END_KEY)); + Assert.assertTrue(modified.getAttributes().get(VCFConstants.END_KEY).equals(13)); + Assert.assertTrue(modified.getReference().equals(SHORT_REF)); + Assert.assertTrue(modified.getAlternateAllele(0).equals(Allele.NON_REF_ALLELE)); + Assert.assertTrue(!modified.filtersWereApplied()); + Assert.assertTrue(modified.getLog10PError() == VariantContext.NO_LOG10_PERROR); + } + + @Test + public void testChangeCallToGQ0HomRef() { + final ReblockGVCF reblocker = new ReblockGVCF(); + + final Genotype g = makeG("sample1", LONG_REF, Allele.NON_REF_ALLELE, 200, 100, 200, 11, 0, 37); + final VariantContext toBeNoCalled = makeDeletionVC("lowQualVar", Arrays.asList(LONG_REF, DELETION, Allele.NON_REF_ALLELE), LONG_REF.length(), g); + final Map noAttributesMap = new HashMap<>(); + final GenotypeBuilder noCalled = reblocker.changeCallToGQ0HomRef(toBeNoCalled, noAttributesMap); + final Genotype newG = noCalled.make(); + Assert.assertTrue(noAttributesMap.containsKey(VCFConstants.END_KEY)); + Assert.assertTrue(noAttributesMap.get(VCFConstants.END_KEY).equals(13)); + Assert.assertTrue(newG.getAllele(0).equals(SHORT_REF)); + Assert.assertTrue(newG.getAllele(1).equals(SHORT_REF)); + Assert.assertTrue(!newG.hasAD()); + } + + //TODO: these are duplicated from PosteriorProbabilitiesUtilsUnitTest but PR #4947 modifies VariantContextTestUtils, so I'll do some refactoring before the second of the two is merged + private Genotype makeG(final String sample, final Allele a1, final Allele a2, final int... pls) { + return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).PL(pls).make(); + } + + private VariantContext makeDeletionVC(final String source, final List alleles, final int refLength, final Genotype... genotypes) { + final int start = 10; + final int stop = start+refLength-1; + return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(Arrays.asList(genotypes)).filters((String)null).make(); + } + + private Genotype addAD(final Genotype g, final int... ads) { + return new GenotypeBuilder(g).AD(ads).make(); + } + + private VariantContext addAttributes(final VariantContext vc, final Map attributes) { + return new VariantContextBuilder(vc).attributes(attributes).make(); + } + +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/utils/variant/writers/GVCFWriterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/variant/writers/GVCFWriterUnitTest.java index 84999f4cd6f..5333782dc27 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/variant/writers/GVCFWriterUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/variant/writers/GVCFWriterUnitTest.java @@ -11,6 +11,7 @@ import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFStandardHeaderLines; import org.broadinstitute.hellbender.engine.FeatureDataSource; +import org.broadinstitute.hellbender.tools.walkers.variantutils.ReblockGVCF; import org.broadinstitute.hellbender.utils.MathUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; @@ -71,6 +72,7 @@ public void setHeader(VCFHeader header) { } private static final List standardPartition = ImmutableList.of(1, 10, 20); + private static final List highConfLowConf = ImmutableList.of(20,100); private static final Allele REF = Allele.create("G", true); private static final Allele ALT = Allele.create("A"); private static final List ALLELES = ImmutableList.of(REF, Allele.NON_REF_ALLELE); @@ -116,6 +118,17 @@ private static VariantContext makeHomRef(final String contig, final int start, f return makeVariantContext(vcb, Arrays.asList(REF, REF), GQ); } + private VariantContext makeHomRef(final String contig, final int start, final int GQ, final int end) { + final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, end, ALLELES); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); + gb.GQ(GQ); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + vcb.attribute(VCFConstants.END_KEY, end); + return vcb.genotypes(gb.make()).make(); + } + private static VariantContext makeHomRefAlt(final int start) { final VariantContextBuilder vcb = new VariantContextBuilder("test", CHR1, start, start, Arrays.asList(REF, ALT)); return makeVariantContext(vcb, Arrays.asList(REF, REF), 0); @@ -357,6 +370,18 @@ public void testNonContiguousBlocks() { assertGoodVC(mockWriter.emitted.get(1), CHR1, 10, 11, false); } + @Test + public void testInputBlocks() { + final MockWriter mockWriter = new MockWriter(); + final GVCFWriter writer = new GVCFWriter(mockWriter, highConfLowConf, HomoSapiensConstants.DEFAULT_PLOIDY); + + writer.add(makeHomRef("20", 1, 16, 600)); + writer.add(makeHomRef("20", 601, 0, 620)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 1); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 620, false); + } + @Test public void testDeletion() { final MockWriter mockWriter = new MockWriter(); @@ -391,7 +416,7 @@ public void testHomRefAlt() { writer.close(); Assert.assertEquals(mockWriter.emitted.size(), 3); assertGoodVC(mockWriter.emitted.get(0), CHR1, 1, 2, false); - Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("END")); + Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute(VCFConstants.END_KEY)); Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("BLOCK_SIZE")); assertGoodVC(mockWriter.emitted.get(2), CHR1, 4, 7, false); } @@ -629,4 +654,53 @@ public void testAgainstExampleGVCF() throws IOException { } + @Test + public void testOverlappingDeletions() { + final ReblockGVCF reblocker = new ReblockGVCF(); + + final Allele ref1 = Allele.create("TACACACACATACACACACAC", true); + final Allele alt1 = Allele.create("T", false); + final Allele ref2 = Allele.create("TACACACACACTACTA", true); + final Allele ref3 = Allele.create("C", true); + final VariantContext deletion1 = new VariantContextBuilder(null, "1", 10000, 10020, Arrays.asList(ref1, alt1, + Allele.NON_REF_ALLELE)) + .log10PError(1000 / -10 ) + .genotypes(new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(ref1, alt1)) + .AD(new int[]{7,23,0}) + .DP(30) + .GQ(99) + .PL(new int[] {40, 0, 212, 591, 281, 872}) + .attribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY, new int[]{4,3,23,0}) + .make()) + .make(); + + final VariantContext deletion2 = new VariantContextBuilder(null, "1", 10010, 10025, Arrays.asList(ref2, alt1, + Allele.NON_REF_ALLELE)) + .log10PError(10 / -10 ) + .genotypes(new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(ref2, alt1)) + .AD(new int[]{7,23,0}) + .DP(30) + .GQ(99) + .PL(new int[] {40, 0, 212, 591, 281, 872}) + .attribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY, new int[]{4,3,23,0}) + .make()) + .make(); + + final VariantContext origRefBlock = makeHomRef("1", 10026, 60, 10050); + + //Let's say that these "low quality" deletions are below the RGQ threshold and get converted to homRefs with all zero PLs + final VariantContext block1 = reblocker.lowQualVariantToGQ0HomRef(deletion1, deletion1); + final VariantContext block2 = reblocker.lowQualVariantToGQ0HomRef(deletion2, deletion2); + + final MockWriter mockWriter = new MockWriter(); + final GVCFWriter writer = new GVCFWriter(mockWriter, Arrays.asList(20,100), 2); + writer.add(deletion1); + writer.add(block2); + writer.add(origRefBlock); + writer.close(); + Assert.assertTrue(mockWriter.emitted.size() == 3); + Assert.assertTrue(mockWriter.emitted.get(1).getEnd()+1 == mockWriter.emitted.get(2).getStart()); + //The first two blocks overlap, which is fine, but the important thing is that there's no "hole" between the first deletion and the final block + } + } diff --git a/src/test/resources/large/NA12878.prod.chr20snippet.g.vcf.gz b/src/test/resources/large/NA12878.prod.chr20snippet.g.vcf.gz new file mode 100644 index 00000000000..d3d6b906a85 --- /dev/null +++ b/src/test/resources/large/NA12878.prod.chr20snippet.g.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93501556e4efb5f889b7419ef05ce396930b4015d266da5da6bc9fe3af3cbe85 +size 3892216 diff --git a/src/test/resources/large/NA12878.prod.chr20snippet.g.vcf.gz.tbi b/src/test/resources/large/NA12878.prod.chr20snippet.g.vcf.gz.tbi new file mode 100644 index 00000000000..613f1d22a2a --- /dev/null +++ b/src/test/resources/large/NA12878.prod.chr20snippet.g.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fc6793dbda8f5bf66c2a0f692fc845e7fab76b5d7da2df990da2b7d0d7fafb7 +size 814 diff --git a/src/test/resources/large/testProductionGVCF.expected.g.vcf.gz b/src/test/resources/large/testProductionGVCF.expected.g.vcf.gz new file mode 100644 index 00000000000..62298071799 --- /dev/null +++ b/src/test/resources/large/testProductionGVCF.expected.g.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3434135510e78426cc1c1f5ad552956fc9094d56b7d60bf778778753f8da43a9 +size 272188 diff --git a/src/test/resources/large/testProductionGVCF.expected.g.vcf.gz.tbi b/src/test/resources/large/testProductionGVCF.expected.g.vcf.gz.tbi new file mode 100644 index 00000000000..0f270ece120 --- /dev/null +++ b/src/test/resources/large/testProductionGVCF.expected.g.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:956bb4e7acd913cdf6affa0556ad18e84b9bf120eb979e719bb64bfa1297f54b +size 369 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.alleleSpecific.g.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.alleleSpecific.g.vcf index 3c4b47050b2..6206edc8981 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.alleleSpecific.g.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.alleleSpecific.g.vcf @@ -90,7 +90,6 @@ ##INFO= ##INFO= ##INFO= -##INFO= ##INFO= ##INFO= ##INFO= diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.g.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.g.vcf index 897c5701ec0..864c8e90d8e 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.g.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.g.vcf @@ -83,7 +83,6 @@ ##INFO= ##INFO= ##INFO= -##INFO= ##INFO= ##INFO= ##INFO= diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.g.vcf.idx b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.g.vcf.idx index 8090317b0b7..e63f7ce8701 100644 Binary files a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.g.vcf.idx and b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGVCFMode.gatk4.g.vcf.idx differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf index eb9b0e240fb..842236c0226 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf @@ -19,7 +19,6 @@ ##INFO= ##INFO= ##INFO= -##INFO= ##INFO= ##INFO= ##contig= diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores.gatk4.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores.gatk4.vcf index f2640afaaa9..dad5ee4d505 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores.gatk4.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores.gatk4.vcf @@ -19,7 +19,6 @@ ##INFO= ##INFO= ##INFO= -##INFO= ##INFO= ##INFO= ##contig= diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testVCFMode.gatk4.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testVCFMode.gatk4.vcf index f9ff9d6a7cd..662b9fda6e8 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testVCFMode.gatk4.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testVCFMode.gatk4.vcf @@ -19,7 +19,6 @@ ##INFO= ##INFO= ##INFO= -##INFO= ##INFO= ##INFO= ##contig= diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/gvcfForReblocking.g.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/gvcfForReblocking.g.vcf new file mode 100644 index 00000000000..ab55049e824 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/gvcfForReblocking.g.vcf @@ -0,0 +1,35 @@ +##fileformat=VCFv4.1 +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive) +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##source=SelectVariants +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1 +chr20 69485 . G . . BLOCK_SIZE=20;END=69510 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:94:99:82:99:0,120,1800 +chr20 69511 . A G, 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:AD:DP:GQ:PL:SB 1/1:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33 +chr20 69512 . A . . BLOCK_SIZE=10;END=69521 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:96:99:82:99:0,120,1800 +chr20 69522 . A . . BLOCK_SIZE=27;END=69548 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:95:0:95:0:0,0,0 +chr20 69549 . T . . BLOCK_SIZE=86;END=69634 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:6:16:5:16:0,16,90 +chr20 69635 . A T, 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:10:10,0,119,101,128,229:0,4,0,3 +chr20 69762 . A . . BLOCK_SIZE=1;END=69762 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:18:7:18:0,18,270 +chr20 69763 . A . . BLOCK_SIZE=4;END=69766 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:21:7:21:0,21,253 +chr20 69767 . A . . BLOCK_SIZE=4;END=69770 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:12:7:12:0,12,180 +chr20 69771 . TAAAAA T, 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:20:20,0,119,101,128,229:0,4,0,3 +chr20 69777 . G . . BLOCK_SIZE=10;END=69783 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:0:3:0:0,0,0 +chr20 69785 . T G, 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:DP:GQ:PL:SB 1/1:80:99:2284,207,0,2287,237,2316:0,1,46,33 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/gvcfForReblocking.g.vcf.idx b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/gvcfForReblocking.g.vcf.idx new file mode 100644 index 00000000000..52f70037a73 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/gvcfForReblocking.g.vcf.idx differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/nonRefAD.g.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/nonRefAD.g.vcf new file mode 100644 index 00000000000..faafa35f3a0 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/nonRefAD.g.vcf @@ -0,0 +1,38 @@ +##fileformat=VCFv4.2 +##ALT= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine.HaplotypeCaller= +##GATKCommandLine.ReblockGVCF= +##GATKCommandLine= +##GVCFBlock0-20=minGQ=0(inclusive),maxGQ=20(exclusive) +##GVCFBlock20-100=minGQ=20(inclusive),maxGQ=100(exclusive) +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##reference=file:///seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta +##source=SelectVariants +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HK017-0046 +chr20 1133569 . A ATATATATATATATATATATTT, 240.77 . DP=22;MQRankSum=1.380;MQ_DP=22;QUALapprox=269;RAW_MQ=69700.00;ReadPosRankSum=0.000;VarDP=7 GT:AD:DP:GQ:PL:SB 0/1:2,4,1:7:30:269,0,253,30,51,87:0,2,3,1 +chr20 1133750 . A G,AGAATGGAATG, 399.76 . DP=121;MLEAC=1,0,1;MLEAF=0.500,0.00,0.500;MQ=37.23 GT:AD:DP:GQ:PL:SB 1/3:0,6,0,0:6:9:436,21,9,291,45,834,242,0,270,220:0,0,5,1 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/nonRefAD.g.vcf.idx b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/nonRefAD.g.vcf.idx new file mode 100644 index 00000000000..4dd2ab4db2f Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/nonRefAD.g.vcf.idx differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/testJustOneSample.expected.g.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/testJustOneSample.expected.g.vcf new file mode 100755 index 00000000000..3edd30b356e --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/testJustOneSample.expected.g.vcf @@ -0,0 +1,36 @@ +##fileformat=VCFv4.2 +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GVCFBlock0-20=minGQ=0(inclusive),maxGQ=20(exclusive) +##GVCFBlock20-100=minGQ=20(inclusive),maxGQ=100(exclusive) +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##source=SelectVariants +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1 +chr20 69485 . G . . END=69510 GT:DP:GQ:MIN_DP:PL 0/0:94:99:94:0,120,1800 +chr20 69511 . A G, 2255.77 . DP=82;MQ=31.05;MQRankSum=-0.866;MQ_DP=82;ReadPosRankSum=1.689 GT:AD:DP:GQ:PL:SB 1/1:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33 +chr20 69512 . A . . END=69521 GT:DP:GQ:MIN_DP:PL 0/0:96:99:96:0,120,1800 +chr20 69522 . A . . END=69635 GT:DP:GQ:MIN_DP:PL 0/0:7:0:6:0,0,0 +chr20 69762 . A . . END=69762 GT:DP:GQ:MIN_DP:PL 0/0:7:18:7:0,18,270 +chr20 69763 . A . . END=69766 GT:DP:GQ:MIN_DP:PL 0/0:7:21:7:0,21,253 +chr20 69767 . A . . END=69770 GT:DP:GQ:MIN_DP:PL 0/0:7:12:7:0,12,180 +chr20 69771 . TAAAAA T, 0.61 . DP=7;MQ=34.15;MQRankSum=1.300;MQ_DP=7;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:20:20,0,119,101,128,229:0,4,0,3 +chr20 69777 . G . . END=69783 GT:DP:GQ:MIN_DP:PL 0/0:7:0:7:0,0,0 +chr20 69785 . T G, 2255.77 . DP=82;MQ=31.05;MQRankSum=-0.866;MQ_DP=82;ReadPosRankSum=1.689 GT:DP:GQ:PL:SB 1/1:80:99:2284,207,0,2287,237,2316:0,1,46,33 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/testNonRefADCorrection.expected.g.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/testNonRefADCorrection.expected.g.vcf new file mode 100755 index 00000000000..9a7322be47d --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/testNonRefADCorrection.expected.g.vcf @@ -0,0 +1,38 @@ +##fileformat=VCFv4.2 +##ALT= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine.HaplotypeCaller= +##GATKCommandLine.ReblockGVCF= +##GATKCommandLine= +##GVCFBlock0-20=minGQ=0(inclusive),maxGQ=20(exclusive) +##GVCFBlock20-100=minGQ=20(inclusive),maxGQ=100(exclusive) +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##reference=file:///seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta +##source=SelectVariants +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HK017-0046 +chr20 1133569 . A ATATATATATATATATATATTT, 240.77 . DP=22;MQRankSum=1.380;MQ_DP=22;RAW_MQ=69700.00;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL:SB 0/1:2,4,0:6:30:269,0,253,30,51,87:0,2,3,1 +chr20 1133750 . A G,AGAATGGAATG, 408.79 . DP=121;MQ=37.23;MQ_DP=121 GT:AD:DP:GQ:PL:SB 1/3:0,6,0,0:6:9:436,21,9,291,45,834,242,0,270,220:0,0,5,1 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/testOneSampleAsForGnomAD.expected.g.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/testOneSampleAsForGnomAD.expected.g.vcf new file mode 100755 index 00000000000..d6bc2119206 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/testOneSampleAsForGnomAD.expected.g.vcf @@ -0,0 +1,36 @@ +##fileformat=VCFv4.2 +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GVCFBlock0-20=minGQ=0(inclusive),maxGQ=20(exclusive) +##GVCFBlock20-100=minGQ=20(inclusive),maxGQ=100(exclusive) +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##source=SelectVariants +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1 +chr20 69485 . G . . END=69510 GT:DP:GQ:MIN_DP:PL 0/0:94:99:94:0,120,1800 +chr20 69511 . A G, 2255.77 . DP=82;MQ=31.05;MQRankSum=-0.866;MQ_DP=82;QUALapprox=2284;ReadPosRankSum=1.689;VarDP=80 GT:AD:DP:GQ:PL:SB 1/1:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33 +chr20 69512 . A . . END=69521 GT:DP:GQ:MIN_DP:PL 0/0:96:99:96:0,120,1800 +chr20 69549 . T . . END=69634 GT:DP:GQ:MIN_DP:PL 0/0:6:16:6:0,16,90 +chr20 69635 . A T, 0.06 . DP=7;MQ=34.15;MQRankSum=1.300;MQ_DP=7;QUALapprox=10;ReadPosRankSum=1.754;VarDP=7 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:10:10,0,119,101,128,229:0,4,0,3 +chr20 69762 . A . . END=69762 GT:DP:GQ:MIN_DP:PL 0/0:7:18:7:0,18,270 +chr20 69763 . A . . END=69766 GT:DP:GQ:MIN_DP:PL 0/0:7:21:7:0,21,253 +chr20 69767 . A . . END=69770 GT:DP:GQ:MIN_DP:PL 0/0:7:12:7:0,12,180 +chr20 69771 . TAAAAA T, 0.61 . DP=7;MQ=34.15;MQRankSum=1.300;MQ_DP=7;QUALapprox=20;ReadPosRankSum=1.754;VarDP=7 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:20:20,0,119,101,128,229:0,4,0,3 +chr20 69785 . T G, 2255.77 . DP=82;MQ=31.05;MQRankSum=-0.866;MQ_DP=82;QUALapprox=2284;ReadPosRankSum=1.689;VarDP=80 GT:DP:GQ:PL:SB 1/1:80:99:2284,207,0,2287,237,2316:0,1,46,33