From ab174c533d8c1c1763635c497f8b8dae9f870c40 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Tue, 11 Jun 2019 16:15:14 -0400
Subject: [PATCH 1/4] Skip optimization if the index has duplicate data

Skip sort optimization if the index has 50% or more data with the same
value. When an index has many docs with the same value, the sort
optimization doesn't make sense: DistanceFeatureQuery will produce the
same scores for these docs, and Lucene will use the secondary sort to
break ties. This could be slower than regular sorting.

---
 .../search/query/QueryPhase.java              | 92 ++++++++++++++++++-
 .../search/query/QueryPhaseTests.java         | 75 ++++++++-------
 2 files changed, 128 insertions(+), 39 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
index 48e238c3e3168..dc86c842cc685 100644
--- a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
+++ b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -387,8 +387,12 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
             ((sortField.getReverse() == false) && (missingValue == Long.MAX_VALUE));
         if (missingValuesAccordingToSort == false) return null;

+        int docCount = PointValues.getDocCount(reader, fieldName);
+        // it is not worth running the optimization on a small index; also, the estimation of duplicate data doesn't work well on a small index
+        if (docCount <= 512) return null;
+
         // check for multiple values
-        if (PointValues.size(reader, fieldName) != PointValues.getDocCount(reader, fieldName)) return null; //TODO: handle multiple values
+        if (PointValues.size(reader, fieldName) != docCount) return null; //TODO: handle multiple values

         // check if the optimization makes sense with the track_total_hits setting
         if (searchContext.trackTotalHitsUpTo() == Integer.MAX_VALUE) {
@@ -408,6 +412,7 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
         if (minValue == maxValue) {
             rewrittenQuery = new DocValuesFieldExistsQuery(fieldName);
         } else {
+            if (indexFieldHasDuplicateData(reader, fieldName)) return null;
             long origin = (sortField.getReverse()) ?
                maxValue : minValue;
             long pivotDistance = (maxValue - minValue) >>> 1; // division by 2 on the unsigned representation to avoid overflow
             if (pivotDistance == 0) { // 0 if maxValue = (minValue + 1)
@@ -469,5 +474,90 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
         return true;
     }

+    /**
+     * Returns true if more than 50% of the data in the index has the same value.
+     * The evaluation is an approximation based on finding the median value and estimating its count.
+     * Returns true if most of the segments have duplicate data, false otherwise.
+     */
+    static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
+        int duplicateSegments = 0;
+        int noDuplicateSegments = 0;
+        for (LeafReaderContext lrc : reader.leaves()) {
+            PointValues pointValues = lrc.reader().getPointValues(field);
+            int thresholdDocCount = pointValues.getDocCount()/2;
+            byte[] minValueAsBytes = pointValues.getMinPackedValue();
+            byte[] maxValueAsBytes = pointValues.getMaxPackedValue();
+            long minValue = LongPoint.decodeDimension(minValueAsBytes, 0);
+            long maxValue = LongPoint.decodeDimension(maxValueAsBytes, 0);
+            long medianCount = estimateMedianCount(pointValues, minValue, maxValue, thresholdDocCount);
+            if (medianCount > thresholdDocCount) {
+                duplicateSegments++;
+            } else {
+                noDuplicateSegments++;
+            }
+        }
+        return (duplicateSegments >= noDuplicateSegments);
+    }
+
+    private static long estimateMedianCount(PointValues pointValues, long minValue, long maxValue, long threshold) {
+        while (minValue < maxValue) {
+            long avgValue = Math.floorDiv(minValue + maxValue, 2);
+            long countLeft = estimatePointCount(pointValues, minValue, avgValue);
+            if (countLeft >= threshold) {
+                maxValue = avgValue;
+                threshold = countLeft/2;
+            } else {
+                long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
+                minValue = avgValue + 1;
+                threshold = countRight/2;
+            }
+        }
+        // maxValue is the approximate median value, estimate its count
+        long medianCount = estimatePointCount(pointValues, maxValue, maxValue);
+        return medianCount;
+    }
+
+
+    private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
+        final byte[] minValueAsBytes = new byte[Long.BYTES];
+        LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
+        final byte[] maxValueAsBytes = new byte[Long.BYTES];
+        LongPoint.encodeDimension(maxValue, maxValueAsBytes, 0);
+
+        PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() {
+            @Override
+            public void grow(int count) {}
+
+            @Override
+            public void visit(int docID) {}
+
+            @Override
+            public void visit(int docID, byte[] packedValue) {
+                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, minValueAsBytes, 0,
+                        Long.BYTES) < 0) {
+                    // Doc's value is too low, in this dimension
+                    return;
+                }
+                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
+                    // Doc's value is too high, in this dimension
+                    return;
+                }
+            }
+
+            @Override
+            public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+                if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0 ||
+                    Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0) {
+                    return PointValues.Relation.CELL_OUTSIDE_QUERY;
+                }
+                if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0 ||
+                    Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
+                    return
+                        PointValues.Relation.CELL_CROSSES_QUERY;
+                }
+                return PointValues.Relation.CELL_INSIDE_QUERY;
+            }
+        };
+        return pointValues.estimatePointCount(visitor);
+    }
+
     private static class TimeExceededException extends RuntimeException {}
 }
diff --git a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
index bf0c8fca9a15c..e3a3b45d3c4bd 100644
--- a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
+++ b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -31,6 +31,7 @@
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NoMergePolicy;
@@ -88,6 +89,7 @@
 import java.util.Collections;
 import java.util.List;

+import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
@@ -652,9 +654,9 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
         when(searchContext.mapperService()).thenReturn(mapperService);

-        final int numDocs = scaledRandomIntBetween(50, 100);
+        final int numDocs = 10000;
         Directory dir = newDirectory();
-        RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
         for (int i = 0; i < numDocs; ++i) {
             Document doc = new Document();
             long longValue = randomLongBetween(-10000000L, 10000000L);
@@ -708,6 +710,39 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         dir.close();
     }

+    public void testIndexFieldHasDuplicateData() throws IOException {
+        final int numDocs = 10000;
+        final int threshold1 = numDocs * 60 / 100;
+        final int threshold2 = numDocs * 40 / 100;
+        final int threshold3 = numDocs * 5 / 100;
+
+        final String fieldName = "duplicateField";
+        final String fieldName2 = "notMuchDuplicateField";
+        final String fieldName3 = "notDuplicateField";
+
+        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
+        long value, value2, value3;
+        Directory dir = newDirectory();
+        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
+        for (int i = 0; i < numDocs; ++i) {
+            value = i < threshold1 ? duplicateValue : i;
+            value2 = i < threshold2 ? duplicateValue : i;
+            value3 = i < threshold3 ?
+                duplicateValue : i;
+            Document doc = new Document();
+            doc.add(new LongPoint(fieldName, value));
+            doc.add(new LongPoint(fieldName2, value2));
+            doc.add(new LongPoint(fieldName3, value3));
+            writer.addDocument(doc);
+        }
+        writer.close();
+        final IndexReader reader = DirectoryReader.open(dir);
+        assertTrue(indexFieldHasDuplicateData(reader, fieldName));
+        assertFalse(indexFieldHasDuplicateData(reader, fieldName2));
+        assertFalse(indexFieldHasDuplicateData(reader, fieldName3));
+        reader.close();
+        dir.close();
+    }
+
     public void testMaxScoreQueryVisitor() {
         BitSetProducer producer = context -> new FixedBitSet(1);

@@ -760,42 +795,6 @@ public void testMaxScoreQueryVisitor() {
         }
     }

-    public void testNumericLongSortOptimizationDocsHaveTheSameValue() throws Exception {
-        final String fieldNameLong = "long-field";
-        MappedFieldType fieldTypeLong = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG);
-        MapperService mapperService = mock(MapperService.class);
-        when(mapperService.fullName(fieldNameLong)).thenReturn(fieldTypeLong);
-        TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
-        when(searchContext.mapperService()).thenReturn(mapperService);
-
-        final int numDocs = scaledRandomIntBetween(5, 10);
-        long longValue = randomLongBetween(-10000000L, 10000000L); // all docs have the same value
-        Directory dir = newDirectory();
-        RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
-        for (int i = 0; i < numDocs; ++i) {
-            Document doc = new Document();
-            doc.add(new LongPoint(fieldNameLong, longValue));
-            doc.add(new NumericDocValuesField(fieldNameLong, longValue));
-            writer.addDocument(doc);
-        }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        IndexSearcher searcher = getAssertingSortOptimizedSearcher(reader, 1);
-
-        final SortField sortFieldLong = new SortField(fieldNameLong, SortField.Type.LONG);
-        sortFieldLong.setMissingValue(Long.MAX_VALUE);
-        final Sort longSort = new Sort(sortFieldLong);
-        SortAndFormats sortAndFormats = new SortAndFormats(longSort, new DocValueFormat[]{DocValueFormat.RAW});
-        searchContext.sort(sortAndFormats);
-        searchContext.parsedQuery(new ParsedQuery(new MatchAllDocsQuery()));
-        searchContext.setTask(new SearchTask(123L, "", "", "", null, Collections.emptyMap()));
-        searchContext.setSize(10);
-        QueryPhase.execute(searchContext, searcher, checkCancelled -> {});
-        assertSortResults(searchContext.queryResult().topDocs().topDocs, (long) numDocs, false);
-        reader.close();
-        dir.close();
-    }
-
     // used to check that numeric long or date sort optimization was run
     private static IndexSearcher getAssertingSortOptimizedSearcher(IndexReader reader, int queryType) {
         return new IndexSearcher(reader) {

From e13761badbfb59e22ce9a416f877f0c7f089c3b3 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Wed, 19 Jun 2019 10:13:25 -0400
Subject: [PATCH 2/4] Address Jim's feedback

---
 .../search/query/QueryPhase.java              | 28 ++++++++++---------
 .../search/query/QueryPhaseTests.java         |  2 +-
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
index dc86c842cc685..643699a67b1b0 100644
--- a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
+++ b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -236,6 +236,9 @@ static boolean execute(SearchContext searchContext,
             System.arraycopy(oldFormats, 0, newFormats, 1, oldFormats.length);
             sortAndFormatsForRewrittenNumericSort = searchContext.sort(); // stash SortAndFormats to restore it later
             searchContext.sort(new SortAndFormats(new Sort(newSortFields), newFormats));
+            if (LOGGER.isTraceEnabled()) {
+                LOGGER.trace("Sort optimization on the field [" + oldSortFields[0].getField() + "] was enabled!");
+            }
         }
     }

@@ -388,7 +391,7 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
         if (missingValuesAccordingToSort == false) return null;

         int docCount = PointValues.getDocCount(reader, fieldName);
-        // it is not worth running the optimization on a small index; also, the estimation of duplicate data doesn't work well on a small index
+        // it is not worth running the optimization on a small index
         if (docCount <= 512) return null;

         // check for multiple values
@@ -477,26 +480,26 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
     /**
      * Returns true if more than 50% of the data in the index has the same value.
      * The evaluation is an approximation based on finding the median value and estimating its count.
-     * Returns true if most of the segments have duplicate data, false otherwise.
+     * Returns true if the total count of median values is greater than or equal to half of the total count of documents.
      */
     static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
-        int duplicateSegments = 0;
-        int noDuplicateSegments = 0;
+        long globalDocCount = 0;
+        long globalMedianCount = 0;
         for (LeafReaderContext lrc : reader.leaves()) {
             PointValues pointValues = lrc.reader().getPointValues(field);
-            int thresholdDocCount = pointValues.getDocCount()/2;
+            int docCount = pointValues.getDocCount();
+            if (docCount <= 512) { // skipping small segments as estimateMedianCount doesn't work well on them
+                continue;
+            }
+            globalDocCount += docCount;
             byte[] minValueAsBytes = pointValues.getMinPackedValue();
             byte[] maxValueAsBytes = pointValues.getMaxPackedValue();
             long minValue = LongPoint.decodeDimension(minValueAsBytes, 0);
             long maxValue = LongPoint.decodeDimension(maxValueAsBytes, 0);
-            long medianCount = estimateMedianCount(pointValues, minValue, maxValue, thresholdDocCount);
-            if (medianCount > thresholdDocCount) {
-                duplicateSegments++;
-            } else {
-                noDuplicateSegments++;
-            }
+            long medianCount = estimateMedianCount(pointValues, minValue, maxValue, docCount/2);
+            globalMedianCount += medianCount;
         }
-        return (duplicateSegments >= noDuplicateSegments);
+        return (globalMedianCount >= globalDocCount/2);
     }

     private static long estimateMedianCount(PointValues pointValues, long minValue, long maxValue, long threshold) {
@@ -517,7 +520,6 @@ private static long estimateMedianCount(PointValues pointValues, long minValue,
         return medianCount;
     }

-
     private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
         final byte[] minValueAsBytes = new byte[Long.BYTES];
         LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
diff --git a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
index e3a3b45d3c4bd..6108eb20ed323 100644
--- a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
+++ b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -654,7 +654,7 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
         when(searchContext.mapperService()).thenReturn(mapperService);

-        final int numDocs = 10000;
+        final int numDocs = 1000;
         Directory dir = newDirectory();
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
         for (int i = 0; i < numDocs; ++i) {

From 9c71827301e75be2378387c020299b90958137f8 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Mon, 24 Jun 2019 17:27:41 -0400
Subject: [PATCH 3/4] Add test for finding duplicate data in BKD-tree

This allows controlling the number of points in a leaf node.

---
 .../search/query/QueryPhase.java              |  35 ++----
 .../search/query/QueryPhaseTests.java         | 101 ++++++++++++------
 2 files changed, 80 insertions(+), 56 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
index 643699a67b1b0..0b401d0ddb3d9 100644
--- a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
+++ b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -487,40 +487,36 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
         long globalMedianCount = 0;
         for (LeafReaderContext lrc : reader.leaves()) {
             PointValues pointValues = lrc.reader().getPointValues(field);
+            if (pointValues == null) continue;
             int docCount = pointValues.getDocCount();
             if (docCount <= 512) { // skipping small segments as estimateMedianCount doesn't work well on them
                 continue;
             }
             globalDocCount += docCount;
-            byte[] minValueAsBytes = pointValues.getMinPackedValue();
-            byte[] maxValueAsBytes = pointValues.getMaxPackedValue();
-            long minValue = LongPoint.decodeDimension(minValueAsBytes, 0);
-            long maxValue = LongPoint.decodeDimension(maxValueAsBytes, 0);
-            long medianCount = estimateMedianCount(pointValues, minValue, maxValue, docCount/2);
+            long medianValue = estimateMedianValue(pointValues);
+            long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
             globalMedianCount += medianCount;
         }
         return (globalMedianCount >= globalDocCount/2);
     }

-    private static long estimateMedianCount(PointValues pointValues, long minValue, long maxValue, long threshold) {
+    static long estimateMedianValue(PointValues pointValues) throws IOException {
+        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
         while (minValue < maxValue) {
             long avgValue = Math.floorDiv(minValue + maxValue, 2);
             long countLeft = estimatePointCount(pointValues, minValue, avgValue);
-            if (countLeft >= threshold) {
+            long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
+            if (countLeft >= countRight) {
                 maxValue = avgValue;
-                threshold = countLeft/2;
             } else {
-                long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
                 minValue = avgValue + 1;
-                threshold = countRight/2;
             }
         }
-        // maxValue is the approximate median value, estimate its count
-        long medianCount = estimatePointCount(pointValues, maxValue, maxValue);
-        return medianCount;
+        return maxValue;
     }

-    private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
+    static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
         final byte[] minValueAsBytes = new byte[Long.BYTES];
         LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
         final byte[] maxValueAsBytes = new byte[Long.BYTES];
@@ -534,16 +530,7 @@ public void grow(int count) {}
             public void visit(int docID) {}

             @Override
-            public void visit(int docID, byte[] packedValue) {
-                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, minValueAsBytes, 0,
-                        Long.BYTES) < 0) {
-                    // Doc's value is too low, in this dimension
-                    return;
-                }
-                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
-                    // Doc's value is too high, in this dimension
-                    return;
-                }
-            }
+            public void visit(int docID, byte[] packedValue) {}

             @Override
             public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
diff --git a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
index 6108eb20ed323..1e885b4cabc70 100644
--- a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
+++ b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -66,8 +66,13 @@
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchTask;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -89,11 +94,14 @@
 import java.util.Collections;
 import java.util.List;

-import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
+import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
+import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.lessThan;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 import static org.mockito.Mockito.spy;
@@ -654,7 +662,7 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
         when(searchContext.mapperService()).thenReturn(mapperService);

-        final int numDocs = 1000;
+        final int numDocs = 4000;
         Directory dir = newDirectory();
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
         for (int i = 0; i < numDocs; ++i) {
@@ -710,39 +718,68 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         dir.close();
     }

-    public void testIndexFieldHasDuplicateData() throws IOException {
-        final int numDocs = 10000;
-        final int threshold1 = numDocs * 60 / 100;
-        final int threshold2 = numDocs * 40 / 100;
-        final int threshold3 = numDocs * 5 / 100;
-
-        final String fieldName = "duplicateField";
-        final String fieldName2 = "notMuchDuplicateField";
-        final String fieldName3 = "notDuplicateField";
-
-        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
-        long value, value2, value3;
-        Directory dir = newDirectory();
-        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
-        for (int i = 0; i < numDocs; ++i) {
-            value = i < threshold1 ? duplicateValue : i;
-            value2 = i < threshold2 ? duplicateValue : i;
-            value3 = i < threshold3 ?
-                duplicateValue : i;
-            Document doc = new Document();
-            doc.add(new LongPoint(fieldName, value));
-            doc.add(new LongPoint(fieldName2, value2));
-            doc.add(new LongPoint(fieldName3, value3));
-            writer.addDocument(doc);
+    public void testIndexHasDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long)(valuesCount * 0.6);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ?
+                    expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                assertEquals(expectedMedianValue, medianValue);
+                assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); // assert that the index has duplicate data
+                assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
+                assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
+            }
+        }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        assertTrue(indexFieldHasDuplicateData(reader, fieldName));
-        assertFalse(indexFieldHasDuplicateData(reader, fieldName2));
-        assertFalse(indexFieldHasDuplicateData(reader, fieldName3));
-        reader.close();
-        dir.close();
     }

+    public void testIndexHasNotDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long)(valuesCount * 0.35);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ?
+                    expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                // can't make any assertion about the values of medianValue and medianCount
+                // as BKDReader::estimatePointCount can be really off for non-duplicate data
+                assertThat(medianCount, lessThan((long) (valuesCount/2))); // assert that the index does NOT have duplicate data
+            }
+        }
+    }

     public void testMaxScoreQueryVisitor() {
         BitSetProducer producer = context -> new FixedBitSet(1);

From 05df4e9fe7ab642dc96a1f985253fc2abfc70ddd Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Tue, 2 Jul 2019 09:32:34 -0400
Subject: [PATCH 4/4] Ensure single values

---
 .../main/java/org/elasticsearch/search/query/QueryPhase.java | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
index 0b401d0ddb3d9..b3ca4a04ea75f 100644
--- a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
+++ b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -236,9 +236,6 @@ static boolean execute(SearchContext searchContext,
             System.arraycopy(oldFormats, 0, newFormats, 1, oldFormats.length);
             sortAndFormatsForRewrittenNumericSort = searchContext.sort(); // stash SortAndFormats to restore it later
             searchContext.sort(new SortAndFormats(new Sort(newSortFields), newFormats));
-            if (LOGGER.isTraceEnabled()) {
-                LOGGER.trace("Sort optimization on the field [" + oldSortFields[0].getField() + "] was enabled!");
-            }
         }
     }

@@ -492,6 +489,7 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
             if (docCount <= 512) { // skipping small segments as estimateMedianCount doesn't work well on them
                 continue;
             }
+            assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
             globalDocCount += docCount;
             long medianValue = estimateMedianValue(pointValues);
             long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
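
A note on the heuristic in indexFieldHasDuplicateData: at its core it is a binary search over the field's value range that uses the BKD tree's estimatePointCount as a counting oracle — halve the range, keep whichever half holds more points, and repeat until the range collapses to a single value (the approximate median); the field is considered duplicate-heavy when that value accounts for at least half of the documents. Below is a minimal standalone sketch of the same idea, with a sorted long[] standing in for the BKD tree and a hypothetical countInRange() helper playing the role of Lucene's PointValues.estimatePointCount(); it is an illustration, not the actual Lucene code path.

    import java.util.Arrays;

    // Simplified model of the duplicate-data heuristic in this PR. Counts here
    // are exact, whereas Lucene's estimatePointCount() only approximates
    // (hence the 512-doc cutoff and the loose assertions in the tests).
    class DuplicateDataModel {

        // Stand-in for PointValues.estimatePointCount(): values in [min, max].
        static long countInRange(long[] sorted, long min, long max) {
            return lowerBound(sorted, max + 1) - lowerBound(sorted, min); // assumes max < Long.MAX_VALUE
        }

        // Index of the first element >= key.
        static int lowerBound(long[] sorted, long key) {
            int idx = Arrays.binarySearch(sorted, key);
            if (idx < 0) return -idx - 1;
            while (idx > 0 && sorted[idx - 1] == key) idx--; // step back to the first occurrence
            return idx;
        }

        // Mirrors estimateMedianValue(): binary search on the value range,
        // keeping the half that holds more points.
        static long estimateMedianValue(long[] sorted) {
            long minValue = sorted[0];
            long maxValue = sorted[sorted.length - 1];
            while (minValue < maxValue) {
                // note: minValue + maxValue may overflow for extreme values; fine for this illustration
                long avgValue = Math.floorDiv(minValue + maxValue, 2);
                long countLeft = countInRange(sorted, minValue, avgValue);
                long countRight = countInRange(sorted, avgValue + 1, maxValue);
                if (countLeft >= countRight) {
                    maxValue = avgValue;
                } else {
                    minValue = avgValue + 1;
                }
            }
            return maxValue;
        }

        // Mirrors indexFieldHasDuplicateData() for a single segment.
        static boolean hasDuplicateData(long[] sorted) {
            long median = estimateMedianValue(sorted);
            return countInRange(sorted, median, median) >= sorted.length / 2;
        }

        public static void main(String[] args) {
            long[] dup = new long[1000];
            for (int i = 0; i < dup.length; i++) dup[i] = (i < 600) ? 42 : i;
            Arrays.sort(dup);
            System.out.println(hasDuplicateData(dup));      // true: 60% share the value 42

            long[] distinct = new long[1000];
            for (int i = 0; i < distinct.length; i++) distinct[i] = i;
            System.out.println(hasDuplicateData(distinct)); // false: every value unique
        }
    }

Each probe touches only BKD metadata, so the whole check costs on the order of log(maxValue - minValue) count estimates per segment rather than a scan of the points. The trade-off, as the comment in testIndexHasNotDuplicateData notes, is that estimatePointCount can be far off for ranges that only partially overlap leaf cells, which is why small segments are skipped and the non-duplicate test makes no exact-count assertions.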
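
The rationale in the first commit message (duplicate values defeat the rewrite) can also be made concrete. tryRewriteLongSort replaces the sort with a DistanceFeatureQuery whose documented score shape is weight * pivotDistance / (pivotDistance + distance); the following is a hedged sketch of what that means for duplicates, with the origin and pivot chosen as in the patch — it mirrors Lucene's documented distance-feature formula, not the Lucene implementation itself.

    // Model of the score shape the DistanceFeatureQuery rewrite relies on.
    class RewriteScoreModel {
        static double score(long value, long origin, long pivotDistance) {
            long distance = Math.abs(value - origin); // ignores overflow for illustration
            return pivotDistance / (double) (pivotDistance + distance);
        }

        public static void main(String[] args) {
            long minValue = 0, maxValue = 1000;
            long origin = minValue;                           // ascending sort -> origin = minValue
            long pivotDistance = (maxValue - minValue) >>> 1; // as in tryRewriteLongSort
            System.out.println(score(10, origin, pivotDistance));  // near the origin -> high score
            System.out.println(score(900, origin, pivotDistance)); // far from it -> low score
            // With heavy duplication every doc sits at the same distance and
            // gets the same score, so the collector falls back to the slower
            // tie-break sort -- the case indexFieldHasDuplicateData() detects.
        }
    }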