Skip to content

Commit

Permalink
Improved implementation of allele-specific new qual (#5460)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbenjamin authored Dec 11, 2018
1 parent 8de55e8 commit a012990
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.function.IntConsumer;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

Expand All @@ -18,19 +19,27 @@
*
* <p>Alleles are represented herein by their indices running from <b>0</b> to <b>N-1</b> where <i>N</i> is the number of alleles.</p>
*
* <p>Genotypes are represented as a single array of alternating alleles and counts, where only alleles with non-zero counts are included:
* [allele 1, count1, allele 2, count2. . .]</p>
*
* <p>Each allele present in a genotype (count != 0) has a <i>rank</i>, that is the 0-based ordinal of
* that allele amongst the ones present in the genotype as sorted by their index.</p>
*
* <p>For example:</p>
*
* <p><b>0/0/2/2</b> has two alleles with indices <b>0</b> and <b>2</b>, both with count 2.
* <p><b>[0,1,2,1]</b> has two alleles with indices <b>0</b> and <b>2</b>, both with count 1, corresponding to diploid genotype 0/2.
* The rank of <b>0</b> is <i>0</i> whereas the rank of <b>2</b> is <i>1</i>.</p>
*
* <p><b>2/4/4/7</b> has three alleles with indices <b>2</b>, <b>4</b> and <b>7</b>. <b>2</b> and <b>7</b> have count 1 whereas <b>4</b> has count 2.
* <p><b>[2,1,4,2,7,1]</b> has three alleles with indices <b>2</b>, <b>4</b> and <b>7</b>. <b>2</b> and <b>7</b> have count 1 whereas <b>4</b> has count 2.
* It corresponds to tetraploid genotype 2/4/4/7
* The rank of <b>2</b> is <i>0</i>, the rank of <b>4</b> is <i>1</i>. and the rank of <b>7</b> is <i>2</i>.</p>
*
* <p>In contrast, in both examples above both <b>3</b> and <b>10</b> (and many others) are absent thus they have no rank (represented by <i>-1</i> whenever applies).</p>
*
* <p><b>[0,0,1,2]</b> is not valid because allele 0 has a count of 0 and should be absent from the array.</p>
*
* <p><b>[1,1,0,1]</b> is not valid because allele 1 comes before allele 0.</p>
*
* <p>{@link GenotypeAlleleCounts} instances have themselves their own index (returned by {@link #index() index()}, that indicate their 0-based ordinal within the possible genotype combinations with the same ploidy.</p>
*
* <p>For example, for ploidy 3:</p>
Expand Down Expand Up @@ -152,7 +161,7 @@ protected void increase(final int times) {
}

/**
* Updates the genotype counts to match the next genotype.
* Updates the genotype counts to match the next genotype according to the canonical ordering of PLs.
*
* <p>
* This method must not be invoked on cached genotype-allele-counts that are meant to remain constant,
Expand Down Expand Up @@ -655,6 +664,31 @@ public void forEachAlleleIndexAndCount(final IntBiConsumer action) {
new IndexRange(0, distinctAlleleCount).forEach(n -> action.accept(sortedAlleleCounts[2*n], sortedAlleleCounts[2*n+1]));
}

/**
* Perform an action for every allele index not represented in this genotype. For example if the total allele count
* is 4 and {@code sortedAlleleCounts} is [0,1,2,1] then alleles 0 and 2 are present, each with a count of 1, while
* alleles 1 and 3 are absent, so we perform {@code action} on 1 and 3.
*/
public void forEachAbsentAlleleIndex(final IntConsumer action, final int alleleCount) {
int presentAlleleIndex = 0;
int presentAllele = sortedAlleleCounts[0];

for (int n = 0; n < alleleCount; n++) {
// if we find n in sortedAlleleCounts, it is present, so we move presentAllele to the next
// index in sortedAlleleCounts and skip the allele; otherwise the allele is absent and we perform the action on it.
if (n == presentAllele) {
// if we haven't exhausted all the present alleles, move to the next one.
// Note that distinctAlleleCount == sortedAlleleCounts.length/2
if (++presentAlleleIndex < distinctAlleleCount) {
// every other entry in sortedAlleleCounts is an allele index; hence we multiply by 2
presentAllele = sortedAlleleCounts[2 * presentAlleleIndex];
}
continue;
}
action.accept(n);
}
}

public double sumOverAlleleIndicesAndCounts(final IntToDoubleBiFunction func) {
return new IndexRange(0, distinctAlleleCount).sum(n -> func.apply(sortedAlleleCounts[2*n], sortedAlleleCounts[2*n+1]));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
import org.apache.commons.math3.util.MathArrays;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAlleleCounts;
Expand Down Expand Up @@ -85,6 +86,9 @@ public AFCalculationResult getLog10PNonRef(final VariantContext vc, final int de

final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);
final Map<Integer, int[]> nonVariantIndicesByPloidy = new Int2ObjectArrayMap<>();

// re-usable buffers of the log10 genotype posteriors of genotypes missing each allele
final List<DoubleArrayList> log10AbsentPosteriors = IntStream.range(0,numAlleles).mapToObj(n -> new DoubleArrayList()).collect(Collectors.toList());
for (final Genotype g : vc.getGenotypes()) {
if (!g.hasLikelihoods()) {
continue;
Expand All @@ -107,26 +111,30 @@ public AFCalculationResult getLog10PNonRef(final VariantContext vc, final int de
log10PNoVariant += Math.min(0,MathUtils.log10SumLog10(nonVariantLog10Posteriors));
}

// per allele non-log space probabilities of zero counts for this sample
// for each allele calculate the total probability of genotypes containing at least one copy of the allele
final double[] log10ProbabilityOfNonZeroAltAlleles = new double[numAlleles];
Arrays.fill(log10ProbabilityOfNonZeroAltAlleles, Double.NEGATIVE_INFINITY);
// if the VC is biallelic the allele-specific qual equals the variant qual
if (numAlleles == 2) {
continue;
}

// for each allele, we collect the log10 probabilities of genotypes in which the allele is absent, then add (in log space)
// to get the log10 probability that the allele is absent in this sample
log10AbsentPosteriors.forEach(DoubleArrayList::clear); // clear the buffers. Note that this is O(1) due to the primitive backing array
for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) {
final double log10GenotypePosterior = log10GenotypePosteriors[genotype];
glCalc.genotypeAlleleCountsAt(genotype).forEachAlleleIndexAndCount((alleleIndex, count) ->
log10ProbabilityOfNonZeroAltAlleles[alleleIndex] =
MathUtils.log10SumLog10(log10ProbabilityOfNonZeroAltAlleles[alleleIndex], log10GenotypePosterior));
glCalc.genotypeAlleleCountsAt(genotype).forEachAbsentAlleleIndex(a -> log10AbsentPosteriors.get(a).add(log10GenotypePosterior), numAlleles);
}

for (int allele = 0; allele < numAlleles; allele++) {
// if prob of non hom ref == 1 up to numerical precision, short-circuit to avoid NaN
if (log10ProbabilityOfNonZeroAltAlleles[allele] >= 0) {
log10POfZeroCountsByAllele[allele] = Double.NEGATIVE_INFINITY;
} else {
log10POfZeroCountsByAllele[allele] += MathUtils.log10OneMinusPow10(log10ProbabilityOfNonZeroAltAlleles[allele]);
}
}
final double[] log10PNoAllele = log10AbsentPosteriors.stream()
.mapToDouble(buffer -> MathUtils.log10SumLog10(buffer.toDoubleArray()))
.map(x -> Math.min(0, x)).toArray(); // if prob of non hom ref > 1 due to finite precision, short-circuit to avoid NaN

// multiply the cumulative probabilities of alleles being absent, which is addition of logs
MathUtils.addToArrayInPlace(log10POfZeroCountsByAllele, log10PNoAllele);
}

// for biallelic the allele-specific qual equals the variant qual, and we short-circuited the calculation above
if (numAlleles == 2) {
log10POfZeroCountsByAllele[1] = log10PNoVariant;
}

// unfortunately AFCalculationResult expects integers for the MLE. We really should emit the EM no-integer values
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.*;

/**
* Test {@link org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeAlleleCounts}
Expand Down Expand Up @@ -184,8 +181,11 @@ private void testPloidyTwoOrMore(final int ploidy) {

GenotypeAlleleCounts current = GenotypeAlleleCounts.first(ploidy);

while (!current.containsAllele(MAXIMUM_ALLELE_INDEX + 1)) {
while (true) {
final GenotypeAlleleCounts next = current.next();
if (next.containsAllele(MAXIMUM_ALLELE_INDEX + 1)) {
break;
}

// test log10CombinationCount
if (ploidy == 2) {
Expand All @@ -203,14 +203,18 @@ private void testPloidyTwoOrMore(final int ploidy) {

//test forEach
final List<Integer> alleleCountsAsList = new ArrayList<>(next.distinctAlleleCount()*2);
final Set<Integer> absentAlleles = new HashSet<>();
next.forEachAlleleIndexAndCount((alleleIndex, alleleCount) -> {
alleleCountsAsList.add(alleleIndex);
alleleCountsAsList.add(alleleCount);});
next.forEachAbsentAlleleIndex(absentAlleles::add, MAXIMUM_ALLELE_INDEX + 1);
final int[] actualAlleleCounts = new int[next.distinctAlleleCount()*2];
next.copyAlleleCounts(actualAlleleCounts, 0);

Assert.assertEquals(alleleCountsAsList.stream().mapToInt(n->n).toArray(), actualAlleleCounts);

Assert.assertEquals(absentAlleles.size(), MAXIMUM_ALLELE_INDEX + 1 - next.distinctAlleleCount());
next.forEachAlleleIndexAndCount((index, count) -> Assert.assertTrue(!absentAlleles.contains(index)));


if (current.distinctAlleleCount() == 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculator;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.testng.Assert;
import org.testng.annotations.Test;

Expand Down Expand Up @@ -128,6 +129,40 @@ public void testManySamplesWithLowConfidence() {
Assert.assertTrue(counts[8] >= 3); // ten samples
}

// a previous implementation of {@link AlleleFrequencyCalculator} had finite precision errors that manifested
// when there were very many confident samples. This is a regression test for that bug.
// See https://github.com/broadinstitute/gatk/issues/4833
@Test
public void testManyVeryConfidentSamples() {
// flat prior to simplify back-of-the-envelope calculations
final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 1, 1, DEFAULT_PLOIDY);
final List<Allele> alleles = Arrays.asList(A,B,C);


final Genotype AC = genotypeWithObviousCall(DIPLOID, TRIALLELIC, new int[] {0,1,2,1}, EXTREMELY_CONFIDENT_PL);
for (final int numSamples : new int[] {100, 1000}) {

final VariantContext vc = makeVC(alleles, Collections.nCopies(numSamples, AC));
final AFCalculationResult result = afCalc.getLog10PNonRef(vc);
Assert.assertEquals(result.getAlleleCountAtMLE(B), 0);
Assert.assertEquals(result.getAlleleCountAtMLE(C), numSamples);

Assert.assertEquals(result.getLog10LikelihoodOfAFEq0(), result.getLog10PosteriorOfAFEq0ForAllele(C), numSamples * 0.01);

// with a large number of samples all with the AC genotype, the calculator will learn that the frequencies of the A and C alleles
// are 1/2, while the frequency of the B allele is 0. Thus the only genotypes with appreciable priors are AA, AC, and CC
// with priors of 1/4, 1/2, and 1/4 and relative likelihoods of 1, 10^(PL/10), and 1
// the posterior probability of each sample having the C allele is thus
// (1 + 2*10^(PL/10))/(1 + 2*10^(PL/10) + 1) = (1 + x/2)/(1 + x), where x = 10^(-PL/10)

// to first-order in x, which is an extremely good approximation, this is 1 - x/2
// thus the probability that N identical samples don't have the C allele is (x/2)^N, and the log-10 probability of this is
// N * [log_10(1/2) - PL/10]
final double expectedLog10ProbabilityOfNoCAllele = numSamples * (MathUtils.LOG10_ONE_HALF - EXTREMELY_CONFIDENT_PL / 10);
Assert.assertEquals(result.getLog10PosteriorOfAFEq0ForAllele(C), expectedLog10ProbabilityOfNoCAllele, numSamples * 0.01);
}
}

@Test
public void testApproximateMultiplicativeConfidence() {
final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 1, 1, DEFAULT_PLOIDY); //flat prior -- we will choose genotypes such that the posterior remains flat
Expand Down

0 comments on commit a012990

Please sign in to comment.