Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Using cigar complexity to break ties in uninformative reads' best haplotypes #5359

Merged
merged 2 commits into from
Nov 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.reference.ReferenceSequenceFile;
Expand Down Expand Up @@ -30,6 +31,7 @@

import java.io.File;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
Expand All @@ -41,6 +43,12 @@ public final class AssemblyBasedCallerUtils {
public static final String SUPPORTED_ALLELES_TAG="SA";
public static final String CALLABLE_REGION_TAG = "CR";
public static final String ALIGNMENT_REGION_TAG = "AR";
public static final Function<Haplotype, Double> HAPLOTYPE_ALIGNMENT_TIEBREAKING_PRIORITY = h -> {
final Cigar cigar = h.getCigar();
final int referenceTerm = (h.isReference() ? 1 : 0);
final int cigarTerm = cigar == null ? 0 : (1 - cigar.numCigarElements());
return (double) referenceTerm + cigarTerm;
};

/**
* Returns a map with the original read as a key and the realigned read as the value.
Expand All @@ -50,7 +58,7 @@ public final class AssemblyBasedCallerUtils {
* @return never {@code null}
*/
public static Map<GATKRead, GATKRead> realignReadsToTheirBestHaplotype(final ReadLikelihoods<Haplotype> originalReadLikelihoods, final Haplotype refHaplotype, final Locatable paddedReferenceLoc, final SmithWatermanAligner aligner) {
final Collection<ReadLikelihoods<Haplotype>.BestAllele> bestAlleles = originalReadLikelihoods.bestAllelesBreakingTies();
final Collection<ReadLikelihoods<Haplotype>.BestAllele> bestAlleles = originalReadLikelihoods.bestAllelesBreakingTies(HAPLOTYPE_ALIGNMENT_TIEBREAKING_PRIORITY);
final Map<GATKRead, GATKRead> result = new HashMap<>(bestAlleles.size());

for (final ReadLikelihoods<Haplotype>.BestAllele bestAllele : bestAlleles) {
Expand Down Expand Up @@ -291,7 +299,7 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
public static void annotateReadLikelihoodsWithRegions(final ReadLikelihoods<Haplotype> likelihoods,
final Locatable callableRegion) {
//assign alignment regions to each read
final Collection<ReadLikelihoods<Haplotype>.BestAllele> bestHaplotypes = likelihoods.bestAllelesBreakingTies();
final Collection<ReadLikelihoods<Haplotype>.BestAllele> bestHaplotypes = likelihoods.bestAllelesBreakingTies(HAPLOTYPE_ALIGNMENT_TIEBREAKING_PRIORITY);
for (final ReadLikelihoods<Haplotype>.BestAllele bestHaplotype : bestHaplotypes) {
final GATKRead read = bestHaplotype.read;
final Haplotype haplotype = bestHaplotype.allele;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

Expand Down Expand Up @@ -366,7 +367,7 @@ public void normalizeLikelihoods(final double maximumLikelihoodDifferenceCap) {
private void normalizeLikelihoodsPerRead(final double maximumBestAltLikelihoodDifference,
final double[][] sampleValues, final int sampleIndex, final int readIndex) {

final BestAllele bestAlternativeAllele = searchBestAllele(sampleIndex,readIndex,false, false);
final BestAllele bestAlternativeAllele = searchBestAllele(sampleIndex,readIndex,false);

final double worstLikelihoodCap = bestAlternativeAllele.likelihood + maximumBestAltLikelihoodDifference;

Expand Down Expand Up @@ -421,11 +422,12 @@ public List<A> alleles() {
* @param sampleIndex including sample index.
* @param readIndex target read index.
*
* @param useReferenceIfUninformative
* @param priorities An array of allele priorities (higher values have higher priority) to be used, if present, to break ties for
* uninformative likelihoods, in which case the read is assigned to the allele with the higher score.
* @return never {@code null}, but with {@link BestAllele#allele allele} == {@code null}
* if non-could be found.
*/
private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference, boolean useReferenceIfUninformative) {
private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference, Optional<double[]> priorities) {
final int alleleCount = alleles.numberOfAlleles();
if (alleleCount == 0 || (alleleCount == 1 && referenceAlleleIndex == 0 && !canBeReference)) {
return new BestAllele(sampleIndex, readIndex, -1, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY);
Expand All @@ -434,33 +436,56 @@ private BestAllele searchBestAllele(final int sampleIndex, final int readIndex,
final double[][] sampleValues = valuesBySampleIndex[sampleIndex];
int bestAlleleIndex = canBeReference || referenceAlleleIndex != 0 ? 0 : 1;

int secondBestIndex = 0;
double bestLikelihood = sampleValues[bestAlleleIndex][readIndex];
double secondBestLikelihood = Double.NEGATIVE_INFINITY;

for (int a = bestAlleleIndex + 1; a < alleleCount; a++) {
if (!canBeReference && referenceAlleleIndex == a) {
continue;
}
final double candidateLikelihood = sampleValues[a][readIndex];
if (candidateLikelihood > bestLikelihood) {
secondBestIndex = bestAlleleIndex;
bestAlleleIndex = a;
secondBestLikelihood = bestLikelihood;
bestLikelihood = candidateLikelihood;
} else if (candidateLikelihood > secondBestLikelihood) {
secondBestIndex = a;
secondBestLikelihood = candidateLikelihood;
}
}

// if our read is not informative against the ref we set the ref as the best allele. This is so that bamouts don't
// spuriously show deletions in ref reads that end in STRs
if (useReferenceIfUninformative && canBeReference && referenceAlleleIndex != MISSING_REF && bestAlleleIndex != referenceAlleleIndex) {
final double referenceLikelihood = sampleValues[referenceAlleleIndex][readIndex];
if ( bestLikelihood - referenceLikelihood < BestAllele.INFORMATIVE_THRESHOLD ) {
secondBestLikelihood = bestLikelihood;
bestAlleleIndex = referenceAlleleIndex;
bestLikelihood = referenceLikelihood;
if (priorities.isPresent() && bestLikelihood - secondBestLikelihood < BestAllele.INFORMATIVE_THRESHOLD) {
double bestPriority = priorities.get()[bestAlleleIndex];
double secondBestPriority = priorities.get()[secondBestIndex];
for (int a = 0; a < alleleCount; a++) {
final double candidateLikelihood = sampleValues[a][readIndex];
if (a == bestAlleleIndex || (!canBeReference && a == referenceAlleleIndex) || bestLikelihood - candidateLikelihood > BestAllele.INFORMATIVE_THRESHOLD) {
continue;
}
final double candidatePriority = priorities.get()[a];

if (candidatePriority > bestPriority) {
secondBestIndex = bestAlleleIndex;
bestAlleleIndex = a;
secondBestPriority = bestPriority;
bestPriority = candidatePriority;
} else if (candidatePriority > secondBestPriority) {
secondBestIndex = a;
secondBestPriority = candidatePriority;
}
}
}
return new BestAllele(sampleIndex,readIndex,bestAlleleIndex,bestLikelihood,secondBestLikelihood);

bestLikelihood = sampleValues[bestAlleleIndex][readIndex];
secondBestLikelihood = secondBestIndex != bestAlleleIndex ? sampleValues[secondBestIndex][readIndex] : Double.NEGATIVE_INFINITY;

return new BestAllele(sampleIndex, readIndex, bestAlleleIndex, bestLikelihood, secondBestLikelihood);
}

private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference) {
return searchBestAllele(sampleIndex, readIndex, canBeReference, Optional.empty());
}

public void changeReads(final Map<GATKRead, GATKRead> readRealignments) {
Expand Down Expand Up @@ -903,7 +928,7 @@ public void updateNonRefAlleleLikelihoods(final AlleleList<A> allelesToConsider)
final double[][] sampleValues = valuesBySampleIndex[s];
final int readCount = sampleValues[0].length;
for (int r = 0; r < readCount; r++) {
final BestAllele bestAllele = searchBestAllele(s, r, true, false);
final BestAllele bestAllele = searchBestAllele(s, r, true);
int numberOfQualifiedAlleleLikelihoods = 0;
for (int i = 0; i < alleleCount; i++) {
final double alleleLikelihood = sampleValues[i][r];
Expand Down Expand Up @@ -960,25 +985,40 @@ public void contaminationDownsampling(final Map<String, Double> perSampleDownsam
/**
* Returns the collection of best allele estimates for the reads based on the read-likelihoods.
* "Ties" where the ref likelihood is within {@code ReadLikelihoods.INFORMATIVE_THRESHOLD} of the greatest likelihood
* are broken in favor of the reference.
* are broken by the {@code tieBreakingPriority} function.
*
* @throws IllegalStateException if there is no alleles.
*
* @return never {@code null}, one element per read in the read-likelihoods collection.
*/
public Collection<BestAllele> bestAllelesBreakingTies(final Function<A, Double> tieBreakingPriority) {
return IntStream.range(0, numberOfSamples()).boxed().flatMap(n -> bestAllelesBreakingTies(n, tieBreakingPriority).stream()).collect(Collectors.toList());
}

/**
* Default version where ties are broken in favor of the reference allele
*/
public Collection<BestAllele> bestAllelesBreakingTies() {
return IntStream.range(0, numberOfSamples()).boxed().flatMap(n -> bestAllelesBreakingTies(n).stream()).collect(Collectors.toList());
}

/**
* Returns the collection of best allele estimates for one sample's reads based on the read-likelihoods.
* "Ties" where the ref likelihood is within {@code ReadLikelihoods.INFORMATIVE_THRESHOLD} of the greatest likelihood
* are broken in favor of the reference.
* are broken by the {@code tieBreakingPriority} function.
*
* @throws IllegalStateException if there is no alleles.
*
* @return never {@code null}, one element per read in the read-likelihoods collection.
*/
public Collection<BestAllele> bestAllelesBreakingTies(final String sample, final Function<A, Double> tieBreakingPriority) {
final int sampleIndex = indexOfSample(sample);
return bestAllelesBreakingTies(sampleIndex, tieBreakingPriority);
}

/**
* Default version where ties are broken in favor of the reference allele
*/
public Collection<BestAllele> bestAllelesBreakingTies(final String sample) {
final int sampleIndex = indexOfSample(sample);
return bestAllelesBreakingTies(sampleIndex);
Expand All @@ -987,25 +1027,36 @@ public Collection<BestAllele> bestAllelesBreakingTies(final String sample) {
/**
* Returns the collection of best allele estimates for one sample's reads reads based on the read-likelihoods.
* "Ties" where the ref likelihood is within {@code ReadLikelihoods.INFORMATIVE_THRESHOLD} of the greatest likelihood
* are broken in favor of the reference.
* are broken by the {@code tieBreakingPriority} function.
*
* @throws IllegalStateException if there is no alleles.
*
* @return never {@code null}, one element per read in the read-likelihoods collection.
*/
private Collection<BestAllele> bestAllelesBreakingTies(final int sampleIndex) {
private Collection<BestAllele> bestAllelesBreakingTies(final int sampleIndex, final Function<A, Double> tieBreakingPriority) {
Utils.validIndex(sampleIndex, numberOfSamples());

//TODO: this currently just does ref vs alt. Really we want CIGAR complexity.
final Optional<double[]> priorities = alleles() == null ? Optional.empty() :
Optional.of(alleles().stream().mapToDouble(tieBreakingPriority::apply).toArray());

final GATKRead[] sampleReads = readsBySampleIndex[sampleIndex];
final int readCount = sampleReads.length;
final List<BestAllele> result = new ArrayList<>(readCount);
for (int r = 0; r < readCount; r++) {
result.add(searchBestAllele(sampleIndex, r, true, true));
result.add(searchBestAllele(sampleIndex, r, true, priorities));
}

return result;
}

/**
* Default version where ties are broken in favor of the reference allele
*/
private Collection<BestAllele> bestAllelesBreakingTies(final int sampleIndex) {
return bestAllelesBreakingTies(sampleIndex, a -> a.isReference() ? 1.0 : 0);
}


/**
* Returns reads stratified by their best allele.
Expand Down Expand Up @@ -1048,7 +1099,7 @@ private void readsByBestAlleleMap(final int sampleIndex, final Map<A, List<GATKR
final int readCount = reads.length;

for (int r = 0; r < readCount; r++) {
final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true, false);
final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true);
if (!bestAllele.isInformative()) {
continue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@
20 10004194 . C <NON_REF> . . END=10004221 GT:DP:GQ:MIN_DP:PL ./.:50:99:46:0,105,1575 ./.
20 10004222 . C CA,<NON_REF> . . BaseQRankSum=0.335;DP=54;ExcessHet=3.01;MQRankSum=-2.042e+00;RAW_MQ=194729.00;ReadPosRankSum=1.27 GT:AD:DP:GQ:PL:SB ./.:31,8,0:39:50:50,0,923,153,952,1105:18,13,4,4 ./.
20 10004223 . A AG,<NON_REF> . . BaseQRankSum=-4.800e-01;DP=62;ExcessHet=3.01;MQRankSum=1.89;RAW_MQ=210769.00;ReadPosRankSum=0.339 GT:AD:DP:GQ:PL:SB ./.:36,11,0:47:85:85,0,982,193,1017,1210:16,20,5,6 ./.
20 10004224 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:66:0:66:0,0,1403 ./.
20 10004224 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:66:0:66:0,0,1325 ./.
20 10004225 . A <NON_REF> . . END=10004240 GT:DP:GQ:MIN_DP:PL ./.:70:99:65:0,117,1755 ./.
20 10004241 . G <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:66:0:66:0,0,1965 ./.
20 10004242 . A <NON_REF> . . END=10004350 GT:DP:GQ:MIN_DP:PL ./.:73:99:58:0,120,1800 ./.
Expand Down Expand Up @@ -914,7 +914,7 @@
20 10040773 . T <NON_REF> . . END=10040811 GT:DP:GQ:MIN_DP:PL ./.:77:99:75:0,120,1800 ./.
20 10040812 . AT A,<NON_REF> . . DP=79;ExcessHet=3.01;RAW_MQ=203299.00 GT:AD:DP:GQ:PL:SB ./.:0,71,0:71:99:2645,214,0,2645,214,2645:0,0,36,35 ./.
20 10040813 . T *,<NON_REF> . . DP=79 GT:AD:DP:GQ:PL:SB ./.:0,71,0:71:99:2645,214,0,2645,214,2645:0,0,36,35 ./.
20 10040814 . T <NON_REF> . . END=10040820 GT:DP:GQ:MIN_DP:PL ./.:78:99:77:0,120,1800 ./.
20 10040814 . T <NON_REF> . . END=10040820 GT:DP:GQ:MIN_DP:PL ./.:77:99:75:0,120,1800 ./.
20 10040821 . T A,<NON_REF> . . BaseQRankSum=-2.100e-02;DP=78;ExcessHet=3.01;MQRankSum=-9.700e-02;RAW_MQ=199699.00;ReadPosRankSum=-1.385e+00 GT:AD:DP:GQ:PL:SB ./.:38,40,0:78:99:1200,0,1099,1314,1219,2533:20,18,21,19 ./.
20 10040822 . A <NON_REF> . . END=10041303 GT:DP:GQ:MIN_DP:PL ./.:71:99:54:0,119,1800 ./.
20 10041304 . C T,<NON_REF> . . DP=68;ExcessHet=3.01;RAW_MQ=239810.00 GT:AD:DP:GQ:PL:SB ./.:0,68,0:68:99:2794,204,0,2794,204,2794:0,0,34,34 ./.
Expand Down Expand Up @@ -1198,9 +1198,9 @@
20 10087821 . A *,<NON_REF> . . DP=99 GT:AD:DP:GQ:PL:SB ./.:35,0,0:53:99:466,669,2577,661,2237,2189:17,18,8,10 ./.
20 10087822 . G *,<NON_REF> . . DP=99 GT:AD:DP:GQ:PL:SB ./.:35,0,0:53:99:466,669,2577,661,2237,2189:17,18,8,10 ./.
20 10087823 . A <NON_REF> . . END=10087841 GT:DP:GQ:MIN_DP:PL ./.:86:99:83:0,120,1800 ./.
20 10087842 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:82:0:82:0,0,2206 ./.
20 10087843 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:82:73:82:0,73,2973 ./.
20 10087844 . G <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:80:0:80:0,0,2190 ./.
20 10087842 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:82:0:82:0,0,2353 ./.
20 10087843 . A <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:82:99:82:0,120,1800 ./.
20 10087844 . G <NON_REF> . . . GT:DP:GQ:MIN_DP:PL ./.:80:0:80:0,0,2416 ./.
20 10087845 . A <NON_REF> . . END=10088062 GT:DP:GQ:MIN_DP:PL ./.:95:99:71:0,120,1800 ./.
20 10088063 . C T,<NON_REF> . . BaseQRankSum=3.40;DP=93;ExcessHet=3.01;MQRankSum=0.515;RAW_MQ=327579.00;ReadPosRankSum=0.285 GT:AD:DP:GQ:PL:SB ./.:49,44,0:93:99:1446,0,1403,1592,1535,3128:24,25,18,26 ./.
20 10088064 . G <NON_REF> . . END=10088698 GT:DP:GQ:MIN_DP:PL ./.:87:99:60:0,120,1800 ./.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -521,8 +521,7 @@
20 10132717 . T A,<NON_REF> 83.77 . BaseQRankSum=0.593;DP=47;ExcessHet=3.0103;MLEAC=1,0;MLEAF=0.500,0.00;MQRankSum=-3.720;RAW_MQ=114046.00;ReadPosRankSum=-0.480 GT:AD:DP:GQ:PGT:PID:PL:SB 0/1:22,9,0:31:99:0|1:10132706_G_GAC:112,0,872,179,888,1068:6,16,6,3
20 10132718 . G <NON_REF> . . END=10132718 GT:DP:GQ:MIN_DP:PL 0/0:48:90:48:0,90,1350
20 10132719 . T A,<NON_REF> 99.77 . BaseQRankSum=0.592;DP=48;ExcessHet=3.0103;MLEAC=1,0;MLEAF=0.500,0.00;MQRankSum=-3.720;RAW_MQ=117646.00;ReadPosRankSum=-0.829 GT:AD:DP:GQ:PGT:PID:PL:SB 0/1:22,9,0:31:99:0|1:10132706_G_GAC:128,0,872,195,888,1084:6,16,6,3
20 10132720 . G <NON_REF> . . END=10132720 GT:DP:GQ:MIN_DP:PL 0/0:48:90:48:0,90,1350
20 10132721 . A <NON_REF> . . END=10132721 GT:DP:GQ:MIN_DP:PL 0/0:47:81:47:0,81,1215
20 10132720 . G <NON_REF> . . END=10132721 GT:DP:GQ:MIN_DP:PL 0/0:48:99:47:0,99,1485
20 10132722 . CAG C,GAG,CAGAGAGAGAGAGAGAGAG,<NON_REF> 951.73 . DP=48;ExcessHet=3.0103;MLEAC=1,1,0,0;MLEAF=0.500,0.500,0.00,0.00;RAW_MQ=117646.00 GT:AD:DP:GQ:PL:SB 1/2:0,15,9,1,0:25:99:989,671,679,357,0,685,675,373,439,1166,1028,718,465,831,1176:0,0,9,16
20 10132725 . A <NON_REF> . . END=10132726 GT:DP:GQ:MIN_DP:PL 0/0:52:72:51:0,72,1080
20 10132727 . A <NON_REF> . . END=10132728 GT:DP:GQ:MIN_DP:PL 0/0:53:81:52:0,81,1215
Expand Down
Loading