From ab174c533d8c1c1763635c497f8b8dae9f870c40 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Tue, 11 Jun 2019 16:15:14 -0400
Subject: [PATCH 1/4] Skip optimization if the index has duplicate data

Skip sort optimization if the index has 50% or more data with the same
value. When an index has many docs with the same value, the sort
optimization doesn't make sense: DistanceFeatureQuery will produce the
same scores for these docs, and Lucene will use the secondary sort to
break ties. This could be slower than regular sorting.

---
 .../search/query/QueryPhase.java              | 92 ++++++++++++++++++-
 .../search/query/QueryPhaseTests.java         | 75 ++++++++-------
 2 files changed, 128 insertions(+), 39 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
index 48e238c3e3168..dc86c842cc685 100644
--- a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
+++ b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -387,8 +387,12 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
             ((sortField.getReverse() == false) && (missingValue == Long.MAX_VALUE));
         if (missingValuesAccordingToSort == false) return null;

+        int docCount = PointValues.getDocCount(reader, fieldName);
+        // it is not worth running the optimization on a small index; also, the estimation of duplicate data doesn't work well on a small index
+        if (docCount <= 512) return null;
+
         // check for multiple values
-        if (PointValues.size(reader, fieldName) != PointValues.getDocCount(reader, fieldName)) return null; //TODO: handle multiple values
+        if (PointValues.size(reader, fieldName) != docCount) return null; //TODO: handle multiple values

         // check if the optimization makes sense with the track_total_hits setting
         if (searchContext.trackTotalHitsUpTo() == Integer.MAX_VALUE) {
@@ -408,6 +412,7 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
         if (minValue == maxValue) {
             rewrittenQuery = new DocValuesFieldExistsQuery(fieldName);
         } else {
+            if (indexFieldHasDuplicateData(reader, fieldName)) return null;
             long origin = (sortField.getReverse()) ?
                maxValue : minValue;
             long pivotDistance = (maxValue - minValue) >>> 1; // division by 2 on the unsigned representation to avoid overflow
             if (pivotDistance == 0) { // 0 if maxValue = (minValue + 1)
@@ -469,5 +474,90 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
         return true;
     }

+    /**
+     * Returns true if more than 50% of the data in the index has the same value.
+     * The evaluation is an approximation based on finding the median value and estimating its count.
+     * Returns true if most of the segments have duplicate data, false otherwise.
+     */
+    static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
+        int duplicateSegments = 0;
+        int noDuplicateSegments = 0;
+        for (LeafReaderContext lrc : reader.leaves()) {
+            PointValues pointValues = lrc.reader().getPointValues(field);
+            int thresholdDocCount = pointValues.getDocCount()/2;
+            byte[] minValueAsBytes = pointValues.getMinPackedValue();
+            byte[] maxValueAsBytes = pointValues.getMaxPackedValue();
+            long minValue = LongPoint.decodeDimension(minValueAsBytes, 0);
+            long maxValue = LongPoint.decodeDimension(maxValueAsBytes, 0);
+            long medianCount = estimateMedianCount(pointValues, minValue, maxValue, thresholdDocCount);
+            if (medianCount > thresholdDocCount) {
+                duplicateSegments++;
+            } else {
+                noDuplicateSegments++;
+            }
+        }
+        return (duplicateSegments >= noDuplicateSegments);
+    }
+
+    private static long estimateMedianCount(PointValues pointValues, long minValue, long maxValue, long threshold) {
+        while (minValue < maxValue) {
+            long avgValue = Math.floorDiv(minValue + maxValue, 2);
+            long countLeft = estimatePointCount(pointValues, minValue, avgValue);
+            if (countLeft >= threshold) {
+                maxValue = avgValue;
+                threshold = countLeft/2;
+            } else {
+                long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
+                minValue = avgValue + 1;
+                threshold = countRight/2;
+            }
+        }
+        // maxValue is the approximate median value, estimate its count
+        long medianCount = estimatePointCount(pointValues, maxValue, maxValue);
+        return medianCount;
+    }
+
+
+    private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
+        final byte[] minValueAsBytes = new byte[Long.BYTES];
+        LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
+        final byte[] maxValueAsBytes = new byte[Long.BYTES];
+        LongPoint.encodeDimension(maxValue, maxValueAsBytes, 0);
+
+        PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() {
+            @Override
+            public void grow(int count) {}
+
+            @Override
+            public void visit(int docID) {}
+
+            @Override
+            public void visit(int docID, byte[] packedValue) {
+                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, minValueAsBytes, 0,
+                        Long.BYTES) < 0) {
+                    // Doc's value is too low, in this dimension
+                    return;
+                }
+                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
+                    // Doc's value is too high, in this dimension
+                    return;
+                }
+            }
+
+            @Override
+            public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+                if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0 ||
+                    Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0) {
+                    return PointValues.Relation.CELL_OUTSIDE_QUERY;
+                }
+                if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0 ||
+                    Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
+                    return
+                        PointValues.Relation.CELL_CROSSES_QUERY;
+                }
+                return PointValues.Relation.CELL_INSIDE_QUERY;
+            }
+        };
+        return pointValues.estimatePointCount(visitor);
+    }
+
     private static class TimeExceededException extends RuntimeException {}
 }
diff --git a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
index bf0c8fca9a15c..e3a3b45d3c4bd 100644
--- a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
+++ b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -31,6 +31,7 @@
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NoMergePolicy;
@@ -88,6 +89,7 @@
 import java.util.Collections;
 import java.util.List;

+import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
@@ -652,9 +654,9 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
         when(searchContext.mapperService()).thenReturn(mapperService);

-        final int numDocs = scaledRandomIntBetween(50, 100);
+        final int numDocs = 10000;
         Directory dir = newDirectory();
-        RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
         for (int i = 0; i < numDocs; ++i) {
             Document doc = new Document();
             long longValue = randomLongBetween(-10000000L, 10000000L);
@@ -708,6 +710,39 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         dir.close();
     }

+    public void testIndexFieldHasDuplicateData() throws IOException {
+        final int numDocs = 10000;
+        final int threshold1 = numDocs * 60 / 100;
+        final int threshold2 = numDocs * 40 / 100;
+        final int threshold3 = numDocs * 5 / 100;
+
+        final String fieldName = "duplicateField";
+        final String fieldName2 = "notMuchDuplicateField";
+        final String fieldName3 = "notDuplicateField";
+
+        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
+        long value, value2, value3;
+        Directory dir = newDirectory();
+        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
+        for (int i = 0; i < numDocs; ++i) {
+            value = i < threshold1 ? duplicateValue : i;
+            value2 = i < threshold2 ? duplicateValue : i;
+            value3 = i < threshold3 ?
+                duplicateValue : i;
+            Document doc = new Document();
+            doc.add(new LongPoint(fieldName, value));
+            doc.add(new LongPoint(fieldName2, value2));
+            doc.add(new LongPoint(fieldName3, value3));
+            writer.addDocument(doc);
+        }
+        writer.close();
+        final IndexReader reader = DirectoryReader.open(dir);
+        assertTrue(indexFieldHasDuplicateData(reader, fieldName));
+        assertFalse(indexFieldHasDuplicateData(reader, fieldName2));
+        assertFalse(indexFieldHasDuplicateData(reader, fieldName3));
+        reader.close();
+        dir.close();
+    }
+
     public void testMaxScoreQueryVisitor() {
         BitSetProducer producer = context -> new FixedBitSet(1);

@@ -760,42 +795,6 @@ public void testMaxScoreQueryVisitor() {
         }
     }

-    public void testNumericLongSortOptimizationDocsHaveTheSameValue() throws Exception {
-        final String fieldNameLong = "long-field";
-        MappedFieldType fieldTypeLong = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG);
-        MapperService mapperService = mock(MapperService.class);
-        when(mapperService.fullName(fieldNameLong)).thenReturn(fieldTypeLong);
-        TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
-        when(searchContext.mapperService()).thenReturn(mapperService);
-
-        final int numDocs = scaledRandomIntBetween(5, 10);
-        long longValue = randomLongBetween(-10000000L, 10000000L); // all docs have the same value
-        Directory dir = newDirectory();
-        RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
-        for (int i = 0; i < numDocs; ++i) {
-            Document doc = new Document();
-            doc.add(new LongPoint(fieldNameLong, longValue));
-            doc.add(new NumericDocValuesField(fieldNameLong, longValue));
-            writer.addDocument(doc);
-        }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        IndexSearcher searcher = getAssertingSortOptimizedSearcher(reader, 1);
-
-        final SortField sortFieldLong = new SortField(fieldNameLong, SortField.Type.LONG);
-        sortFieldLong.setMissingValue(Long.MAX_VALUE);
-        final Sort longSort = new Sort(sortFieldLong);
-        SortAndFormats sortAndFormats = new SortAndFormats(longSort, new DocValueFormat[]{DocValueFormat.RAW});
-        searchContext.sort(sortAndFormats);
-        searchContext.parsedQuery(new ParsedQuery(new MatchAllDocsQuery()));
-        searchContext.setTask(new SearchTask(123L, "", "", "", null, Collections.emptyMap()));
-        searchContext.setSize(10);
-        QueryPhase.execute(searchContext, searcher, checkCancelled -> {});
-        assertSortResults(searchContext.queryResult().topDocs().topDocs, (long) numDocs, false);
-        reader.close();
-        dir.close();
-    }
-
     // used to check that numeric long or date sort optimization was run
     private static IndexSearcher getAssertingSortOptimizedSearcher(IndexReader reader, int queryType) {
         return new IndexSearcher(reader) {

From e13761badbfb59e22ce9a416f877f0c7f089c3b3 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Wed, 19 Jun 2019 10:13:25 -0400
Subject: [PATCH 2/4] Address Jim's feedback

---
 .../search/query/QueryPhase.java              | 28 ++++++++++---------
 .../search/query/QueryPhaseTests.java         |  2 +-
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
index dc86c842cc685..643699a67b1b0 100644
--- a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
+++ b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -236,6 +236,9 @@ static boolean execute(SearchContext searchContext,
             System.arraycopy(oldFormats, 0, newFormats, 1, oldFormats.length);
             sortAndFormatsForRewrittenNumericSort = searchContext.sort(); // stash SortAndFormats to restore it later
             searchContext.sort(new SortAndFormats(new Sort(newSortFields), newFormats));
+            if (LOGGER.isTraceEnabled()) {
+                LOGGER.trace("Sort optimization on the field [" + oldSortFields[0].getField() + "] was enabled!");
+            }
         }
     }

@@ -388,7 +391,7 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
         if (missingValuesAccordingToSort == false) return null;

         int docCount = PointValues.getDocCount(reader, fieldName);
-        // it is not worth running the optimization on a small index; also, the estimation of duplicate data doesn't work well on a small index
+        // it is not worth running the optimization on a small index
         if (docCount <= 512) return null;

         // check for multiple values
@@ -477,26 +480,26 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
     /**
      * Returns true if more than 50% of the data in the index has the same value.
      * The evaluation is an approximation based on finding the median value and estimating its count.
-     * Returns true if most of the segments have duplicate data, false otherwise.
+     * Returns true if the total count of median values is greater than or equal to half of the total count of documents.
      */
     static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
-        int duplicateSegments = 0;
-        int noDuplicateSegments = 0;
+        long globalDocCount = 0;
+        long globalMedianCount = 0;
         for (LeafReaderContext lrc : reader.leaves()) {
             PointValues pointValues = lrc.reader().getPointValues(field);
-            int thresholdDocCount = pointValues.getDocCount()/2;
+            int docCount = pointValues.getDocCount();
+            if (docCount <= 512) { // skipping small segments as estimateMedianCount doesn't work well on them
+                continue;
+            }
+            globalDocCount += docCount;
             byte[] minValueAsBytes = pointValues.getMinPackedValue();
             byte[] maxValueAsBytes = pointValues.getMaxPackedValue();
             long minValue = LongPoint.decodeDimension(minValueAsBytes, 0);
             long maxValue = LongPoint.decodeDimension(maxValueAsBytes, 0);
-            long medianCount = estimateMedianCount(pointValues, minValue, maxValue, thresholdDocCount);
-            if (medianCount > thresholdDocCount) {
-                duplicateSegments++;
-            } else {
-                noDuplicateSegments++;
-            }
+            long medianCount = estimateMedianCount(pointValues, minValue, maxValue, docCount/2);
+            globalMedianCount += medianCount;
         }
-        return (duplicateSegments >= noDuplicateSegments);
+        return (globalMedianCount >= globalDocCount/2);
     }

     private static long estimateMedianCount(PointValues pointValues, long minValue, long maxValue, long threshold) {
@@ -517,7 +520,6 @@ private static long estimateMedianCount(PointValues pointValues, long minValue,
         return medianCount;
     }

-
     private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
         final byte[] minValueAsBytes = new byte[Long.BYTES];
         LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
diff --git a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
index e3a3b45d3c4bd..6108eb20ed323 100644
--- a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
+++ b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -654,7 +654,7 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
         when(searchContext.mapperService()).thenReturn(mapperService);

-        final int numDocs = 10000;
+        final int numDocs = 1000;
         Directory dir = newDirectory();
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
         for (int i = 0; i < numDocs; ++i) {

From 9c71827301e75be2378387c020299b90958137f8 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Mon, 24 Jun 2019 17:27:41 -0400
Subject: [PATCH 3/4] Add test for finding duplicate data in BKD-tree

This allows controlling the number of points in a leaf node.

---
 .../search/query/QueryPhase.java              |  35 ++----
 .../search/query/QueryPhaseTests.java         | 101 ++++++++++++------
 2 files changed, 80 insertions(+), 56 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
index 643699a67b1b0..0b401d0ddb3d9 100644
--- a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
+++ b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -487,40 +487,36 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
         long globalMedianCount = 0;
         for (LeafReaderContext lrc : reader.leaves()) {
             PointValues pointValues = lrc.reader().getPointValues(field);
+            if (pointValues == null) continue;
             int docCount = pointValues.getDocCount();
             if (docCount <= 512) { // skipping small segments as estimateMedianCount doesn't work well on them
                 continue;
             }
             globalDocCount += docCount;
-            byte[] minValueAsBytes = pointValues.getMinPackedValue();
-            byte[] maxValueAsBytes = pointValues.getMaxPackedValue();
-            long minValue = LongPoint.decodeDimension(minValueAsBytes, 0);
-            long maxValue = LongPoint.decodeDimension(maxValueAsBytes, 0);
-            long medianCount = estimateMedianCount(pointValues, minValue, maxValue, docCount/2);
+            long medianValue = estimateMedianValue(pointValues);
+            long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
             globalMedianCount += medianCount;
         }
         return (globalMedianCount >= globalDocCount/2);
     }

-    private static long estimateMedianCount(PointValues pointValues, long minValue, long maxValue, long threshold) {
+    static long estimateMedianValue(PointValues pointValues) throws IOException {
+        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
         while (minValue < maxValue) {
             long avgValue = Math.floorDiv(minValue + maxValue, 2);
             long countLeft = estimatePointCount(pointValues, minValue, avgValue);
-            if (countLeft >= threshold) {
+            long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
+            if (countLeft >= countRight) {
                 maxValue = avgValue;
-                threshold = countLeft/2;
             } else {
-                long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
                 minValue = avgValue + 1;
-                threshold = countRight/2;
             }
         }
-        // maxValue is the approximate median value, estimate its count
-        long medianCount = estimatePointCount(pointValues, maxValue, maxValue);
-        return medianCount;
+        return maxValue;
     }

-    private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
+    static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
         final byte[] minValueAsBytes = new byte[Long.BYTES];
         LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
         final byte[] maxValueAsBytes = new byte[Long.BYTES];
@@ -534,16 +530,7 @@ public void grow(int count) {}
             public void visit(int docID) {}

             @Override
-            public void visit(int docID, byte[] packedValue) {
-                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, minValueAsBytes, 0,
-                        Long.BYTES) < 0) {
-                    // Doc's value is too low, in this dimension
-                    return;
-                }
-                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
-                    // Doc's value is too high, in this dimension
-                    return;
-                }
-            }
+            public void visit(int docID, byte[] packedValue) {}

             @Override
             public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
diff --git a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
index 6108eb20ed323..1e885b4cabc70 100644
--- a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
+++ b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -66,8 +66,13 @@
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchTask;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -89,11 +94,14 @@
 import java.util.Collections;
 import java.util.List;

-import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
+import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
+import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.lessThan;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 import static org.mockito.Mockito.spy;
@@ -654,7 +662,7 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
         when(searchContext.mapperService()).thenReturn(mapperService);

-        final int numDocs = 1000;
+        final int numDocs = 4000;
         Directory dir = newDirectory();
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
         for (int i = 0; i < numDocs; ++i) {
@@ -710,39 +718,68 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         dir.close();
     }

-    public void testIndexFieldHasDuplicateData() throws IOException {
-        final int numDocs = 10000;
-        final int threshold1 = numDocs * 60 / 100;
-        final int threshold2 = numDocs * 40 / 100;
-        final int threshold3 = numDocs * 5 / 100;
-
-        final String fieldName = "duplicateField";
-        final String fieldName2 = "notMuchDuplicateField";
-        final String fieldName3 = "notDuplicateField";
-
-        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
-        long value, value2, value3;
-        Directory dir = newDirectory();
-        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
-        for (int i = 0; i < numDocs; ++i) {
-            value = i < threshold1 ? duplicateValue : i;
-            value2 = i < threshold2 ? duplicateValue : i;
-            value3 = i < threshold3 ?
-                duplicateValue : i;
-            Document doc = new Document();
-            doc.add(new LongPoint(fieldName, value));
-            doc.add(new LongPoint(fieldName2, value2));
-            doc.add(new LongPoint(fieldName3, value3));
-            writer.addDocument(doc);
+    public void testIndexHasDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long)(valuesCount * 0.6);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ?
+                    expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                assertEquals(expectedMedianValue, medianValue);
+                assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); // assert that the index has duplicate data
+                assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
+                assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
+            }
+        }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        assertTrue(indexFieldHasDuplicateData(reader, fieldName));
-        assertFalse(indexFieldHasDuplicateData(reader, fieldName2));
-        assertFalse(indexFieldHasDuplicateData(reader, fieldName3));
-        reader.close();
-        dir.close();
     }

+    public void testIndexHasNotDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long)(valuesCount * 0.35);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ?
+                    expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                // can't make any assertion about the values of medianValue and medianCount
+                // as BKDReader::estimatePointCount can be really off for non-duplicate data
+                assertThat(medianCount, lessThan((long) (valuesCount/2))); // assert that the index does NOT have duplicate data
+            }
+        }
+    }

     public void testMaxScoreQueryVisitor() {
         BitSetProducer producer = context -> new FixedBitSet(1);

From 05df4e9fe7ab642dc96a1f985253fc2abfc70ddd Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Tue, 2 Jul 2019 09:32:34 -0400
Subject: [PATCH 4/4] Ensure single values

---
 .../main/java/org/elasticsearch/search/query/QueryPhase.java | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
index 0b401d0ddb3d9..b3ca4a04ea75f 100644
--- a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
+++ b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -236,9 +236,6 @@ static boolean execute(SearchContext searchContext,
             System.arraycopy(oldFormats, 0, newFormats, 1, oldFormats.length);
             sortAndFormatsForRewrittenNumericSort = searchContext.sort(); // stash SortAndFormats to restore it later
             searchContext.sort(new SortAndFormats(new Sort(newSortFields), newFormats));
-            if (LOGGER.isTraceEnabled()) {
-                LOGGER.trace("Sort optimization on the field [" + oldSortFields[0].getField() + "] was enabled!");
-            }
         }
     }

@@ -492,6 +489,7 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
             if (docCount <= 512) { // skipping small segments as estimateMedianCount doesn't work well on them
                 continue;
             }
+            assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
             globalDocCount += docCount;
             long medianValue = estimateMedianValue(pointValues);
             long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
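
A note on the heuristic in indexFieldHasDuplicateData: at its core it is a binary search over the field's value range that uses the BKD tree's estimatePointCount as a counting oracle — halve the range, keep whichever half holds more points, and repeat until the range collapses to a single value (the approximate median); the field is considered duplicate-heavy when that value accounts for at least half of the documents. Below is a minimal standalone sketch of the same idea, with a sorted long[] standing in for the BKD tree and a hypothetical countInRange() helper playing the role of Lucene's PointValues.estimatePointCount(); it is an illustration, not the actual Lucene code path.

    import java.util.Arrays;

    // Simplified model of the duplicate-data heuristic in this PR. Counts here
    // are exact, whereas Lucene's estimatePointCount() only approximates
    // (hence the 512-doc cutoff and the loose assertions in the tests).
    class DuplicateDataModel {

        // Stand-in for PointValues.estimatePointCount(): values in [min, max].
        static long countInRange(long[] sorted, long min, long max) {
            return lowerBound(sorted, max + 1) - lowerBound(sorted, min); // assumes max < Long.MAX_VALUE
        }

        // Index of the first element >= key.
        static int lowerBound(long[] sorted, long key) {
            int idx = Arrays.binarySearch(sorted, key);
            if (idx < 0) return -idx - 1;
            while (idx > 0 && sorted[idx - 1] == key) idx--; // step back to the first occurrence
            return idx;
        }

        // Mirrors estimateMedianValue(): binary search on the value range,
        // keeping the half that holds more points.
        static long estimateMedianValue(long[] sorted) {
            long minValue = sorted[0];
            long maxValue = sorted[sorted.length - 1];
            while (minValue < maxValue) {
                // note: minValue + maxValue may overflow for extreme values; fine for this illustration
                long avgValue = Math.floorDiv(minValue + maxValue, 2);
                long countLeft = countInRange(sorted, minValue, avgValue);
                long countRight = countInRange(sorted, avgValue + 1, maxValue);
                if (countLeft >= countRight) {
                    maxValue = avgValue;
                } else {
                    minValue = avgValue + 1;
                }
            }
            return maxValue;
        }

        // Mirrors indexFieldHasDuplicateData() for a single segment.
        static boolean hasDuplicateData(long[] sorted) {
            long median = estimateMedianValue(sorted);
            return countInRange(sorted, median, median) >= sorted.length / 2;
        }

        public static void main(String[] args) {
            long[] dup = new long[1000];
            for (int i = 0; i < dup.length; i++) dup[i] = (i < 600) ? 42 : i;
            Arrays.sort(dup);
            System.out.println(hasDuplicateData(dup));      // true: 60% share the value 42

            long[] distinct = new long[1000];
            for (int i = 0; i < distinct.length; i++) distinct[i] = i;
            System.out.println(hasDuplicateData(distinct)); // false: every value unique
        }
    }

Each probe touches only BKD metadata, so the whole check costs on the order of log(maxValue - minValue) count estimates per segment rather than a scan of the points. The trade-off, as the comment in testIndexHasNotDuplicateData notes, is that estimatePointCount can be far off for ranges that only partially overlap leaf cells, which is why small segments are skipped and the non-duplicate test makes no exact-count assertions.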
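
The rationale in the first commit message (duplicate values defeat the rewrite) can also be made concrete. tryRewriteLongSort replaces the sort with a DistanceFeatureQuery whose documented score shape is weight * pivotDistance / (pivotDistance + distance); the following is a hedged sketch of what that means for duplicates, with the origin and pivot chosen as in the patch — it mirrors Lucene's documented distance-feature formula, not the Lucene implementation itself.

    // Model of the score shape the DistanceFeatureQuery rewrite relies on.
    class RewriteScoreModel {
        static double score(long value, long origin, long pivotDistance) {
            long distance = Math.abs(value - origin); // ignores overflow for illustration
            return pivotDistance / (double) (pivotDistance + distance);
        }

        public static void main(String[] args) {
            long minValue = 0, maxValue = 1000;
            long origin = minValue;                           // ascending sort -> origin = minValue
            long pivotDistance = (maxValue - minValue) >>> 1; // as in tryRewriteLongSort
            System.out.println(score(10, origin, pivotDistance));  // near the origin -> high score
            System.out.println(score(900, origin, pivotDistance)); // far from it -> low score
            // With heavy duplication every doc sits at the same distance and
            // gets the same score, so the collector falls back to the slower
            // tie-break sort -- the case indexFieldHasDuplicateData() detects.
        }
    }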