Skip to content

Commit

Permalink
Introduce a new KeywordField. (#12054)
Browse files Browse the repository at this point in the history
`KeywordField` is a combination of `StringField` and `SortedSetDocValuesField`,
similarly to how `LongField` is a combination of `LongPoint` and
`SortedNumericDocValuesField`. This makes it easier for users to create fields
that can be used for filtering, sorting and faceting.
  • Loading branch information
jpountz authored Feb 7, 2023
1 parent d693264 commit ab074d5
Show file tree
Hide file tree
Showing 6 changed files with 355 additions and 41 deletions.
4 changes: 3 additions & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ API Changes

New Features
---------------------
(No changes)

* GITHUB#12054: Introduce a new KeywordField for simple and efficient
filtering, sorting and faceting. (Adrien Grand)

Improvements
---------------------
Expand Down
188 changes: 188 additions & 0 deletions lucene/core/src/java/org/apache/lucene/document/KeywordField.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;

import java.util.Objects;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexOrDocValuesQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;

/**
 * Field that indexes a per-document String or {@link BytesRef} into an inverted index for fast
 * filtering, stores values in a columnar fashion using {@link DocValuesType#SORTED_SET} doc values
 * for sorting and faceting, and optionally stores values as stored fields for top-hits retrieval.
 * This field does not support scoring: queries produce constant scores. If you need more
 * fine-grained control you can use {@link StringField}, {@link SortedDocValuesField} or {@link
 * SortedSetDocValuesField}, and {@link StoredField}.
 *
 * <p>This field defines static factory methods for creating common query and sort objects:
 *
 * <ul>
 *   <li>{@link #newExactQuery} for matching a value.
 *   <li>{@link #newSetQuery} for matching any of the values coming from a set.
 *   <li>{@link #newSortField} for sorting documents by the values of this field.
 * </ul>
 */
public class KeywordField extends Field {

  private static final FieldType FIELD_TYPE = new FieldType();
  private static final FieldType FIELD_TYPE_STORED;

  static {
    // Indexed as a single un-tokenized term (docs only, no norms) plus SORTED_SET doc values.
    FIELD_TYPE.setIndexOptions(IndexOptions.DOCS);
    FIELD_TYPE.setOmitNorms(true);
    FIELD_TYPE.setTokenized(false);
    FIELD_TYPE.setDocValuesType(DocValuesType.SORTED_SET);
    FIELD_TYPE.freeze();

    // Same configuration, additionally stored.
    FIELD_TYPE_STORED = new FieldType(FIELD_TYPE);
    FIELD_TYPE_STORED.setStored(true);
    FIELD_TYPE_STORED.freeze();
  }

  /** Value exposed through {@link #storedValue()}; {@code null} when the field is not stored. */
  private final StoredValue storedValue;

  /**
   * Creates a new KeywordField.
   *
   * @param name field name
   * @param value the BytesRef value
   * @param stored whether to store the field
   * @throws IllegalArgumentException if the field name or value is null.
   */
  public KeywordField(String name, BytesRef value, Store stored) {
    super(name, value, stored == Field.Store.YES ? FIELD_TYPE_STORED : FIELD_TYPE);
    storedValue = stored == Store.YES ? new StoredValue(value) : null;
  }

  /**
   * Creates a new KeywordField from a String value, by indexing its UTF-8 representation.
   *
   * @param name field name
   * @param value the String value
   * @param stored whether to store the field
   * @throws IllegalArgumentException if the field name or value is null.
   */
  public KeywordField(String name, String value, Store stored) {
    super(name, value, stored == Field.Store.YES ? FIELD_TYPE_STORED : FIELD_TYPE);
    storedValue = stored == Store.YES ? new StoredValue(value) : null;
  }

  /**
   * Returns the value of this field as a {@link BytesRef}. When the parent class holds no binary
   * value (the field carries a String), a new {@link BytesRef} is built from {@link
   * #stringValue()} on each call.
   */
  @Override
  public BytesRef binaryValue() {
    BytesRef binaryValue = super.binaryValue();
    if (binaryValue != null) {
      return binaryValue;
    }
    return new BytesRef(stringValue());
  }

  /** Sets the String value and, when this field is stored, updates the stored value as well. */
  @Override
  public void setStringValue(String value) {
    super.setStringValue(value);
    if (storedValue != null) {
      storedValue.setStringValue(value);
    }
  }

  /** Sets the binary value and, when this field is stored, updates the stored value as well. */
  @Override
  public void setBytesValue(BytesRef value) {
    super.setBytesValue(value);
    if (storedValue != null) {
      storedValue.setBinaryValue(value);
    }
  }

  /** Returns the stored value of this field, or {@code null} if the field is not stored. */
  @Override
  public StoredValue storedValue() {
    return storedValue;
  }

  /**
   * Create a query for matching an exact {@link BytesRef} value.
   *
   * @param field field name. must not be {@code null}.
   * @param value exact value. must not be {@code null}.
   * @throws NullPointerException if {@code field} or {@code value} is null.
   * @return a query matching documents with this exact value
   */
  public static Query newExactQuery(String field, BytesRef value) {
    Objects.requireNonNull(field, "field must not be null");
    Objects.requireNonNull(value, "value must not be null");
    return new ConstantScoreQuery(new TermQuery(new Term(field, value)));
  }

  /**
   * Create a query for matching an exact {@link String} value.
   *
   * @param field field name. must not be {@code null}.
   * @param value exact value. must not be {@code null}.
   * @throws NullPointerException if {@code field} or {@code value} is null.
   * @return a query matching documents with this exact value
   */
  public static Query newExactQuery(String field, String value) {
    // Validate in the same order as the other factory methods: field first, then value.
    Objects.requireNonNull(field, "field must not be null");
    Objects.requireNonNull(value, "value must not be null");
    return newExactQuery(field, new BytesRef(value));
  }

  /**
   * Create a query for matching any of a set of provided {@link BytesRef} values.
   *
   * @param field field name. must not be {@code null}.
   * @param values the set of values to match. must not be {@code null}.
   * @throws NullPointerException if {@code field} or {@code values} is null.
   * @return a query matching documents with any of the provided values
   */
  public static Query newSetQuery(String field, BytesRef... values) {
    Objects.requireNonNull(field, "field must not be null");
    Objects.requireNonNull(values, "values must not be null");
    // Pair the inverted-index query with its doc-values equivalent.
    return new IndexOrDocValuesQuery(
        new TermInSetQuery(field, values), new SortedSetDocValuesSetQuery(field, values));
  }

  /**
   * Create a new {@link SortField} for {@link BytesRef} values.
   *
   * @param field field name. must not be {@code null}.
   * @param reverse true if natural order should be reversed.
   * @param selector custom selector type for choosing the sort value from the set. must not be
   *     {@code null}.
   * @throws NullPointerException if {@code field} or {@code selector} is null.
   * @return a sort field that sorts on the selected value of this field
   */
  public static SortField newSortField(
      String field, boolean reverse, SortedSetSelector.Type selector) {
    Objects.requireNonNull(field, "field must not be null");
    Objects.requireNonNull(selector, "selector must not be null");
    return new SortedSetSortField(field, reverse, selector);
  }
}
125 changes: 125 additions & 0 deletions lucene/core/src/test/org/apache/lucene/document/TestKeywordField.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;

import java.io.IOException;
import java.util.Collections;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;

public class TestKeywordField extends LuceneTestCase {

public void testSetBytesValue() {
Field[] fields =
new Field[] {
new KeywordField("name", newBytesRef("value"), Field.Store.NO),
new KeywordField("name", newBytesRef("value"), Field.Store.YES)
};
for (Field field : fields) {
assertEquals(newBytesRef("value"), field.binaryValue());
assertNull(field.stringValue());
if (field.fieldType().stored()) {
assertEquals(newBytesRef("value"), field.storedValue().getBinaryValue());
} else {
assertNull(field.storedValue());
}
field.setBytesValue(newBytesRef("value2"));
assertEquals(newBytesRef("value2"), field.binaryValue());
assertNull(field.stringValue());
if (field.fieldType().stored()) {
assertEquals(newBytesRef("value2"), field.storedValue().getBinaryValue());
} else {
assertNull(field.storedValue());
}
}
}

public void testSetStringValue() {
Field[] fields =
new Field[] {
new KeywordField("name", "value", Field.Store.NO),
new KeywordField("name", "value", Field.Store.YES)
};
for (Field field : fields) {
assertEquals("value", field.stringValue());
assertEquals(newBytesRef("value"), field.binaryValue());
if (field.fieldType().stored()) {
assertEquals("value", field.storedValue().getStringValue());
} else {
assertNull(field.storedValue());
}
field.setStringValue("value2");
assertEquals("value2", field.stringValue());
assertEquals(newBytesRef("value2"), field.binaryValue());
if (field.fieldType().stored()) {
assertEquals("value2", field.storedValue().getStringValue());
} else {
assertNull(field.storedValue());
}
}
}

public void testIndexBytesValue() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
w.addDocument(
Collections.singleton(new KeywordField("field", newBytesRef("value"), Field.Store.YES)));
IndexReader reader = DirectoryReader.open(w);
w.close();
LeafReader leaf = getOnlyLeafReader(reader);
TermsEnum terms = leaf.terms("field").iterator();
assertEquals(new BytesRef("value"), terms.next());
assertNull(terms.next());
SortedSetDocValues values = leaf.getSortedSetDocValues("field");
assertTrue(values.advanceExact(0));
assertEquals(1, values.docValueCount());
assertEquals(0L, values.nextOrd());
assertEquals(new BytesRef("value"), values.lookupOrd(0));
Document storedDoc = leaf.storedFields().document(0);
assertEquals(new BytesRef("value"), storedDoc.getBinaryValue("field"));
reader.close();
dir.close();
}

public void testIndexStringValue() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
w.addDocument(Collections.singleton(new KeywordField("field", "value", Field.Store.YES)));
IndexReader reader = DirectoryReader.open(w);
w.close();
LeafReader leaf = getOnlyLeafReader(reader);
TermsEnum terms = leaf.terms("field").iterator();
assertEquals(new BytesRef("value"), terms.next());
assertNull(terms.next());
SortedSetDocValues values = leaf.getSortedSetDocValues("field");
assertTrue(values.advanceExact(0));
assertEquals(1, values.docValueCount());
assertEquals(0L, values.nextOrd());
assertEquals(new BytesRef("value"), values.lookupOrd(0));
Document storedDoc = leaf.storedFields().document(0);
assertEquals("value", storedDoc.get("field"));
reader.close();
dir.close();
}
}
Loading

0 comments on commit ab074d5

Please sign in to comment.