broadinstitute · davidbenjamin · Dec 11, 2018 · Oct 30, 2018 · Dec 10, 2018 · Dec 11, 2018
diff --git a/...main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAlleleCounts.java b/...main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAlleleCounts.java
@@ -10,6 +10,7 @@
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.function.IntConsumer;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
@@ -18,19 +19,26 @@
  *
  * <p>Alleles are represented herein by their indices running from <b>0</b> to <b>N-1</b> where <i>N</i> is the number of alleles.</p>
  *
+ * <p>Genotypes are represented as a single array of alternating alleles and counts, where only alleles with non-zero counts are included:
+ * [allele 1, count1, allele 2, count2. . .]</p>
+ *
  * <p>Each allele present in a genotype (count != 0) has a <i>rank</i>, that is the 0-based ordinal of
  * that allele amongst the ones present in the genotype as sorted by their index.</p>
  *
  * <p>For example:</p>
  *
- * <p><b>0/0/2/2</b> has two alleles with indices <b>0</b> and <b>2</b>, both with count 2.
+ * <p><b>[0,1,2,1]</b> has two alleles with indices <b>0</b> and <b>2</b>, both with count 1.
  * The rank of <b>0</b> is <i>0</i> whereas the rank of <b>2</b> is <i>1</i>.</p>
  *
- * <p><b>2/4/4/7</b> has three alleles with indices <b>2</b>, <b>4</b> and <b>7</b>. <b>2</b> and <b>7</b> have count 1 whereas <b>4</b> has count 2.
+ * <p><b>[2,1,4,2,7,1]</b> has three alleles with indices <b>2</b>, <b>4</b> and <b>7</b>. <b>2</b> and <b>7</b> have count 1 whereas <b>4</b> has count 2.
  * The rank of <b>2</b> is <i>0</i>, the rank of <b>4</b> is <i>1</i>. and the rank of <b>7</b> is <i>2</i>.</p>
  *
  * <p>In contrast, in both examples above both <b>3</b> and <b>10</b> (and many others) are absent thus they have no rank (represented by <i>-1</i> whenever applies).</p>
  *
+ * <p><b>[0,0,1,2]</b> is not valid because allele 0 has a count of 0 and should be absent from the array.</p>
+ *
+ * <p><b>[1,1,0,1]</b> is not valid because allele 1 comes before allele 0.</p>
+ *
  * <p>{@link GenotypeAlleleCounts} instances have themselves their own index (returned by {@link #index() index()}, that indicate their 0-based ordinal within the possible genotype combinations with the same ploidy.</p>
  *
  * <p>For example, for ploidy 3:</p>
@@ -152,7 +160,7 @@ protected void increase(final int times) {
     }
 
     /**
-     * Updates the genotype counts to match the next genotype.
+     * Updates the genotype counts to match the next genotype according to the canonical ordering of PLs.
      *
      * <p>
      *     This method must not be invoked on cached genotype-allele-counts that are meant to remain constant,
@@ -655,6 +663,31 @@ public void forEachAlleleIndexAndCount(final IntBiConsumer action) {
         new IndexRange(0, distinctAlleleCount).forEach(n -> action.accept(sortedAlleleCounts[2*n], sortedAlleleCounts[2*n+1]));
     }
 
+    /**
+     * Perform an action for every allele index not represented in this genotype.  For example if the total allele count
+     * is 4 and {@code sortedAlleleCounts} is [0,1,2,1] then alleles 0 and 2 are present, each with a count of 1, while
+     * alleles 1 and 3 are absent, so we perform {@code action} on 1 and 3.
+     */
+    public void forEachAbsentAlleleIndex(final IntConsumer action, final int alleleCount) {
+        int presentAlleleIndex = 0;
+        int presentAllele = sortedAlleleCounts[0];
+
+        for (int n = 0; n < alleleCount; n++) {
+            // if we find n in sortedAlleleCounts, it is present, so we move presentAllele to the next
+            // index in sortedAlleleCounts and skip the allele; otherwise the allele is absent and we perform the action on it.
+            if (n == presentAllele) {
+                // if we haven't exhausted all the present alleles, move to the next one.
+                // Note that distinctAlleleCount == sortedAlleleCounts.length/2
+                if (++presentAlleleIndex < distinctAlleleCount) {
+                    // every other entry in sortedAlleleCounts is an allele index; hence we multiply by 2
+                    presentAllele = sortedAlleleCounts[2 * presentAlleleIndex];
+                }
+                continue;
+            }
+            action.accept(n);
+        }
+    }
+
     public double sumOverAlleleIndicesAndCounts(final IntToDoubleBiFunction func) {
         return new IndexRange(0, distinctAlleleCount).sum(n -> func.apply(sortedAlleleCounts[2*n], sortedAlleleCounts[2*n+1]));
     }

diff --git a/...g/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java b/...g/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java
@@ -3,6 +3,7 @@
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.Genotype;
 import htsjdk.variant.variantcontext.VariantContext;
+import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
 import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
 import org.apache.commons.math3.util.MathArrays;
 import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAlleleCounts;
@@ -85,6 +86,9 @@ public AFCalculationResult getLog10PNonRef(final VariantContext vc, final int de
 
         final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);
         final Map<Integer, int[]> nonVariantIndicesByPloidy = new Int2ObjectArrayMap<>();
+
+        // re-usable buffers of the log10 genotype posteriors of genotypes missing each allele
+        final List<DoubleArrayList> log10AbsentPosteriors = IntStream.range(0,numAlleles).mapToObj(n -> new DoubleArrayList()).collect(Collectors.toList());
         for (final Genotype g : vc.getGenotypes()) {
             if (!g.hasLikelihoods()) {
                 continue;
@@ -107,26 +111,30 @@ public AFCalculationResult getLog10PNonRef(final VariantContext vc, final int de
                 log10PNoVariant += Math.min(0,MathUtils.log10SumLog10(nonVariantLog10Posteriors));
             }
 
-            // per allele non-log space probabilities of zero counts for this sample
-            // for each allele calculate the total probability of genotypes containing at least one copy of the allele
-            final double[] log10ProbabilityOfNonZeroAltAlleles = new double[numAlleles];
-            Arrays.fill(log10ProbabilityOfNonZeroAltAlleles, Double.NEGATIVE_INFINITY);
+            // if the VC is biallelic the allele-specific qual equals the variant qual
+            if (numAlleles == 2) {
+                continue;
+            }
 
+            // for each allele, we collect the log10 probabilities of genotypes in which the allele is absent, then add (in log space)
+            // to get the log10 probability that the allele is absent in this sample
+            log10AbsentPosteriors.forEach(DoubleArrayList::clear);  // clear the buffers.  Note that this is O(1) due to the primitive backing array
             for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) {
                 final double log10GenotypePosterior = log10GenotypePosteriors[genotype];
-                glCalc.genotypeAlleleCountsAt(genotype).forEachAlleleIndexAndCount((alleleIndex, count) ->
-                        log10ProbabilityOfNonZeroAltAlleles[alleleIndex] =
-                                MathUtils.log10SumLog10(log10ProbabilityOfNonZeroAltAlleles[alleleIndex], log10GenotypePosterior));
+                glCalc.genotypeAlleleCountsAt(genotype).forEachAbsentAlleleIndex(a -> log10AbsentPosteriors.get(a).add(log10GenotypePosterior), numAlleles);
             }
 
-            for (int allele = 0; allele < numAlleles; allele++) {
-                // if prob of non hom ref == 1 up to numerical precision, short-circuit to avoid NaN
-                if (log10ProbabilityOfNonZeroAltAlleles[allele] >= 0) {
-                    log10POfZeroCountsByAllele[allele] = Double.NEGATIVE_INFINITY;
-                } else {
-                    log10POfZeroCountsByAllele[allele] += MathUtils.log10OneMinusPow10(log10ProbabilityOfNonZeroAltAlleles[allele]);
-                }
-            }
+            final double[] log10PNoAllele = log10AbsentPosteriors.stream()
+                    .mapToDouble(buffer -> MathUtils.log10SumLog10(buffer.toDoubleArray()))
+                    .map(x -> Math.min(0, x)).toArray();    // if prob of non hom ref > 1 due to finite precision, short-circuit to avoid NaN
+
+            // multiply the cumulative probabilities of alleles being absent, which is addition of logs
+            MathUtils.addToArrayInPlace(log10POfZeroCountsByAllele, log10PNoAllele);
+        }
+
+        // for biallelic the allele-specific qual equals the variant qual, and we short-circuited the calculation above
+        if (numAlleles == 2) {
+            log10POfZeroCountsByAllele[1] = log10PNoVariant;
         }
 
         // unfortunately AFCalculationResult expects integers for the MLE.  We really should emit the EM no-integer values

diff --git a/...a/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAlleleCountsUnitTest.java b/...a/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAlleleCountsUnitTest.java
@@ -7,10 +7,7 @@
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
 
 /**
  * Test {@link org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeAlleleCounts}
@@ -184,8 +181,11 @@ private void testPloidyTwoOrMore(final int ploidy) {
 
         GenotypeAlleleCounts current = GenotypeAlleleCounts.first(ploidy);
 
-        while (!current.containsAllele(MAXIMUM_ALLELE_INDEX + 1)) {
+        while (true) {
             final GenotypeAlleleCounts next = current.next();
+            if (next.containsAllele(MAXIMUM_ALLELE_INDEX + 1)) {
+                break;
+            }
 
             // test log10CombinationCount
             if (ploidy == 2) {
@@ -203,14 +203,18 @@ private void testPloidyTwoOrMore(final int ploidy) {
 
             //test forEach
             final List<Integer> alleleCountsAsList = new ArrayList<>(next.distinctAlleleCount()*2);
+            final Set<Integer> absentAlleles = new HashSet<>();
             next.forEachAlleleIndexAndCount((alleleIndex, alleleCount) -> {
                 alleleCountsAsList.add(alleleIndex);
                 alleleCountsAsList.add(alleleCount);});
+            next.forEachAbsentAlleleIndex(absentAlleles::add, MAXIMUM_ALLELE_INDEX + 1);
             final int[] actualAlleleCounts = new int[next.distinctAlleleCount()*2];
             next.copyAlleleCounts(actualAlleleCounts, 0);
 
             Assert.assertEquals(alleleCountsAsList.stream().mapToInt(n->n).toArray(), actualAlleleCounts);
 
+            Assert.assertEquals(absentAlleles.size(), MAXIMUM_ALLELE_INDEX + 1 - next.distinctAlleleCount());
+            next.forEachAlleleIndexAndCount((index, count) -> Assert.assertTrue(!absentAlleles.contains(index)));
 
 
             if (current.distinctAlleleCount() == 1) {