Skip to content

Commit

Permalink
Enable numeric sort optimization support for all numeric types (#6424)
Browse files Browse the repository at this point in the history
* Adding numeric optimization support for all numeric types

Signed-off-by: gashutos <gashutos@amazon.com>

* modifying CHANGELOG.md

Signed-off-by: gashutos <gashutos@amazon.com>

* Handling multi-cluster scenario where SortField serialization was failing

Signed-off-by: gashutos <gashutos@amazon.com>

* Fixing javadoc errors

Signed-off-by: gashutos <gashutos@amazon.com>

* Fixing nested sort integ tests

Signed-off-by: gashutos <gashutos@amazon.com>

* Stremlining behaviour of custom comparator tests too

Signed-off-by: gashutos <gashutos@amazon.com>

* Adding more integ tests for IntValuesComparatorSource & fixing few ITs

Signed-off-by: gashutos <gashutos@amazon.com>

* Fixing few more integ tests

Signed-off-by: gashutos <gashutos@amazon.com>

* Streamlining applySortWidening method with CreateSort and avoid modifying cteated objects of sort

Signed-off-by: gashutos <gashutos@amazon.com>

* Correcting licence header

Signed-off-by: gashutos <gashutos@amazon.com>

---------

Signed-off-by: gashutos <gashutos@amazon.com>
Co-authored-by: Daniel (dB.) Doubrovkine <dblock@amazon.com>
  • Loading branch information
gashutos and dblock authored Mar 27, 2023
1 parent 16797d6 commit e8425fc
Show file tree
Hide file tree
Showing 14 changed files with 703 additions and 172 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- [Segment Replication] Apply backpressure when replicas fall behind ([#6563](https://github.com/opensearch-project/OpenSearch/pull/6563))
- [Remote Store] Integrate remote segment store in peer recovery flow ([#6664](https://github.com/opensearch-project/OpenSearch/pull/6664))
- [Segment Replication] Add new cluster setting to set replication strategy by default for all indices in cluster. ([#6791](https://github.com/opensearch-project/OpenSearch/pull/6791))
- Enable sort optimization for all NumericTypes ([#6464](https://github.com/opensearch-project/OpenSearch/pull/6464)

### Dependencies
- Bump `org.apache.logging.log4j:log4j-core` from 2.18.0 to 2.20.0 ([#6490](https://github.com/opensearch-project/OpenSearch/pull/6490))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ private static XContentBuilder createTestMapping() {
public void testIndexSort() {
SortField dateSort = new SortedNumericSortField("date", SortField.Type.LONG, false);
dateSort.setMissingValue(Long.MAX_VALUE);
SortField numericSort = new SortedNumericSortField("numeric_dv", SortField.Type.LONG, false);
numericSort.setMissingValue(Long.MAX_VALUE);
SortField numericSort = new SortedNumericSortField("numeric_dv", SortField.Type.INT, false);
numericSort.setMissingValue(Integer.MAX_VALUE);
SortField keywordSort = new SortedSetSortField("keyword_dv", false);
keywordSort.setMissingValue(SortField.STRING_LAST);
Sort indexSort = new Sort(dateSort, numericSort, keywordSort);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -382,24 +382,22 @@ private void createIndexMappingsFromObjectType(String indexName, List<Object> ty
ensureGreen();
}

// Convert Integer, Short, Byte and Boolean to Long in order to match the conversion done
// Convert Integer, Short, Byte and Boolean to Int in order to match the conversion done
// by the internal hits when populating the sort values.
private List<Object> convertSortValues(List<Object> sortValues) {
List<Object> converted = new ArrayList<>();
for (int i = 0; i < sortValues.size(); i++) {
Object from = sortValues.get(i);
if (from instanceof Integer) {
converted.add(((Integer) from).longValue());
} else if (from instanceof Short) {
converted.add(((Short) from).longValue());
if (from instanceof Short) {
converted.add(((Short) from).intValue());
} else if (from instanceof Byte) {
converted.add(((Byte) from).longValue());
converted.add(((Byte) from).intValue());
} else if (from instanceof Boolean) {
boolean b = (boolean) from;
if (b) {
converted.add(1L);
converted.add(1);
} else {
converted.add(0L);
converted.add(0);
}
} else {
converted.add(from);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
Expand Down Expand Up @@ -68,6 +69,7 @@
import org.opensearch.search.profile.ProfileShardResult;
import org.opensearch.search.profile.SearchProfileShardResults;
import org.opensearch.search.query.QuerySearchResult;
import org.opensearch.search.sort.SortedWiderNumericSortField;
import org.opensearch.search.suggest.Suggest;
import org.opensearch.search.suggest.Suggest.Suggestion;
import org.opensearch.search.suggest.completion.CompletionSuggestion;
Expand Down Expand Up @@ -235,14 +237,12 @@ static TopDocs mergeTopDocs(Collection<TopDocs> results, int topN, int from) {
if (numShards == 1 && from == 0) { // only one shard and no pagination we can just return the topDocs as we got them.
return topDocs;
} else if (topDocs instanceof CollapseTopFieldDocs) {
CollapseTopFieldDocs firstTopDocs = (CollapseTopFieldDocs) topDocs;
final Sort sort = new Sort(firstTopDocs.fields);
final CollapseTopFieldDocs[] shardTopDocs = results.toArray(new CollapseTopFieldDocs[numShards]);
final Sort sort = createSort(shardTopDocs);
mergedTopDocs = CollapseTopFieldDocs.merge(sort, from, topN, shardTopDocs, false);
} else if (topDocs instanceof TopFieldDocs) {
TopFieldDocs firstTopDocs = (TopFieldDocs) topDocs;
final Sort sort = new Sort(firstTopDocs.fields);
final TopFieldDocs[] shardTopDocs = results.toArray(new TopFieldDocs[numShards]);
final Sort sort = createSort(shardTopDocs);
mergedTopDocs = TopDocs.merge(sort, from, topN, shardTopDocs);
} else {
final TopDocs[] shardTopDocs = results.toArray(new TopDocs[numShards]);
Expand Down Expand Up @@ -600,6 +600,49 @@ private static void validateMergeSortValueFormats(Collection<? extends SearchPha
}
}

/**
* Creates Sort object from topFieldsDocs fields.
* It is necessary to widen the SortField.Type to maximum byte size for merging sorted docs.
* Different indices might have different types. This will avoid user to do re-index of data
* in case of mapping field change for newly indexed data.
* This will support Int to Long and Float to Double.
* Earlier widening of type was taken care in IndexNumericFieldData, but since we now want to
* support sort optimization, we removed type widening there and taking care here during merging.
* More details here https://github.com/opensearch-project/OpenSearch/issues/6326
*/
private static Sort createSort(TopFieldDocs[] topFieldDocs) {
final SortField[] firstTopDocFields = topFieldDocs[0].fields;
final SortField[] newFields = new SortField[firstTopDocFields.length];

for (int i = 0; i < firstTopDocFields.length; i++) {
if (firstTopDocFields[i] instanceof SortedNumericSortField && isSortWideningRequired(topFieldDocs, i)) {
final SortedNumericSortField delegate = (SortedNumericSortField) firstTopDocFields[i];
newFields[i] = new SortedWiderNumericSortField(
delegate.getField(),
delegate.getNumericType(),
delegate.getReverse(),
delegate.getSelector()
);
} else {
newFields[i] = firstTopDocFields[i];
}
}
return new Sort(newFields);
}

/**
* It will compare respective SortField between shards to see if any shard results have different
* field mapping type, accordingly it will decide to widen the sort fields.
*/
private static boolean isSortWideningRequired(TopFieldDocs[] topFieldDocs, int sortFieldindex) {
for (int i = 0; i < topFieldDocs.length - 1; i++) {
if (topFieldDocs[i].fields[sortFieldindex] != topFieldDocs[i + 1].fields[sortFieldindex]) {
return true;
}
}
return false;
}

/*
* Returns the size of the requested top documents (from + size)
*/
Expand Down
5 changes: 3 additions & 2 deletions server/src/main/java/org/opensearch/common/lucene/Lucene.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@
import org.opensearch.index.analysis.AnalyzerScope;
import org.opensearch.index.analysis.NamedAnalyzer;
import org.opensearch.index.fielddata.IndexFieldData;
import org.opensearch.search.sort.SortedWiderNumericSortField;

import java.io.IOException;
import java.math.BigInteger;
Expand Down Expand Up @@ -599,8 +600,8 @@ public static void writeSortField(StreamOutput out, SortField sortField) throws
SortField newSortField = new SortField(sortField.getField(), SortField.Type.STRING, sortField.getReverse());
newSortField.setMissingValue(sortField.getMissingValue());
sortField = newSortField;
} else if (sortField.getClass() == SortedNumericSortField.class) {
// for multi-valued sort field, we replace the SortedSetSortField with a simple SortField.
} else if (sortField.getClass() == SortedNumericSortField.class || sortField.getClass() == SortedWiderNumericSortField.class) {
// for multi-valued sort field, we replace the SortedNumericSortField/SortedWiderNumericSortField with a simple SortField.
// It works because the sort field is only used to merge results from different shards.
SortField newSortField = new SortField(
sortField.getField(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import org.opensearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.opensearch.index.fielddata.fieldcomparator.DoubleValuesComparatorSource;
import org.opensearch.index.fielddata.fieldcomparator.FloatValuesComparatorSource;
import org.opensearch.index.fielddata.fieldcomparator.IntValuesComparatorSource;
import org.opensearch.index.fielddata.fieldcomparator.LongValuesComparatorSource;
import org.opensearch.search.DocValueFormat;
import org.opensearch.search.MultiValueMode;
Expand All @@ -65,76 +66,16 @@ public abstract class IndexNumericFieldData implements IndexFieldData<LeafNumeri
* @opensearch.internal
*/
public enum NumericType {
BOOLEAN(false, SortField.Type.LONG, CoreValuesSourceType.BOOLEAN) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
BYTE(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
SHORT(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
INT(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
LONG(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.ENABLED;
}
},
DATE(false, SortField.Type.LONG, CoreValuesSourceType.DATE) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.ENABLED;
}
},
DATE_NANOSECONDS(false, SortField.Type.LONG, CoreValuesSourceType.DATE) {
@Deprecated
@Override
public PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.ENABLED;
}
},
HALF_FLOAT(true, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
FLOAT(true, SortField.Type.FLOAT, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
DOUBLE(true, SortField.Type.DOUBLE, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.ENABLED;
}
};
BOOLEAN(false, SortField.Type.INT, CoreValuesSourceType.BOOLEAN),
BYTE(false, SortField.Type.INT, CoreValuesSourceType.NUMERIC),
SHORT(false, SortField.Type.INT, CoreValuesSourceType.NUMERIC),
INT(false, SortField.Type.INT, CoreValuesSourceType.NUMERIC),
LONG(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC),
DATE(false, SortField.Type.LONG, CoreValuesSourceType.DATE),
DATE_NANOSECONDS(false, SortField.Type.LONG, CoreValuesSourceType.DATE),
HALF_FLOAT(true, SortField.Type.LONG, CoreValuesSourceType.NUMERIC),
FLOAT(true, SortField.Type.FLOAT, CoreValuesSourceType.NUMERIC),
DOUBLE(true, SortField.Type.DOUBLE, CoreValuesSourceType.NUMERIC);

private final boolean floatingPoint;
private final ValuesSourceType valuesSourceType;
Expand All @@ -153,24 +94,6 @@ public final boolean isFloatingPoint() {
public final ValuesSourceType getValuesSourceType() {
return valuesSourceType;
}

@Deprecated
protected abstract PointSortOptimization applyPointSortOptimization();
}

/**
* Controls whether to apply sort optimization to skip non-competitive docs
* based on the BKD index.
*
* @deprecated this control will be removed in a future version of OpenSearch
*
* @opensearch.internal
* @opensearch.experimental
*/
@Deprecated
private enum PointSortOptimization {
ENABLED,
DISABLED
}

/**
Expand Down Expand Up @@ -211,21 +134,6 @@ public final SortField sortField(
: SortedNumericSelector.Type.MIN;
SortField sortField = new SortedNumericSortField(getFieldName(), getNumericType().sortFieldType, reverse, selectorType);
sortField.setMissingValue(source.missingObject(missingValue, reverse));

// LUCENE-9280 added the ability for collectors to skip non-competitive
// documents when top docs are sorted by other fields different from the _score.
// However, from Lucene 9 onwards, numeric sort optimisation requires the byte size
// for points (BKD index) and doc values (columnar) and SortField.Type to be matched.
// NumericType violates this requirement
// (see: https://github.com/opensearch-project/OpenSearch/issues/2063#issuecomment-1069358826 test failure)
// because it uses the largest byte size (LONG) for the SortField of most types. The section below disables
// the BKD based sort optimization for numeric types whose encoded BYTE size does not match the comparator (LONG)/
// So as of now, we can only enable for DATE, DATE_NANOSECONDS, LONG, DOUBLE.
// BOOLEAN, BYTE, SHORT, INT, HALF_FLOAT, FLOAT (use long for doc values, but fewer for BKD Points)
// todo : Enable other SortField.Type as well, that will require wider change
if (getNumericType().applyPointSortOptimization() == PointSortOptimization.DISABLED) {
sortField.setOptimizeSortWithPoints(false);
}
return sortField;
}

Expand Down Expand Up @@ -298,9 +206,11 @@ private XFieldComparatorSource comparatorSource(
return dateComparatorSource(missingValue, sortMode, nested);
case DATE_NANOSECONDS:
return dateNanosComparatorSource(missingValue, sortMode, nested);
case LONG:
return new LongValuesComparatorSource(this, missingValue, sortMode, nested);
default:
assert !targetNumericType.isFloatingPoint();
return new LongValuesComparatorSource(this, missingValue, sortMode, nested);
return new IntValuesComparatorSource(this, missingValue, sortMode, nested);
}
}

Expand Down
Loading

0 comments on commit e8425fc

Please sign in to comment.