Allow reusing indexed binary fields. #12053

Merged
merged 5 commits on Jan 12, 2023
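
The diff itself (see `TestField#testBinaryStringField` below) shows the intent: an indexed, non-tokenized field can now be created from a `BytesRef` and have its binary value swapped between documents, so the same Field instance can be reused. A minimal sketch of that usage, assuming an already-open `IndexWriter`; the class and method names here are illustrative, not part of the patch:

```java
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.BytesRef;

class ReuseBinaryFieldExample {
  // Index two documents while reusing a single indexed binary field instance.
  static void indexTwoDocs(IndexWriter writer) throws IOException {
    Document doc = new Document();
    // Indexed, non-tokenized binary term; before this change, setBytesValue()
    // rejected indexed fields ("cannot set a BytesRef value on an indexed field").
    StringField idField = new StringField("id", new BytesRef("id-1"), Field.Store.NO);
    doc.add(idField);
    writer.addDocument(doc);

    // Swap in a new binary value and reuse both the Document and the Field.
    idField.setBytesValue(new BytesRef("id-2"));
    writer.addDocument(doc);
  }
}
```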
9 changes: 9 additions & 0 deletions lucene/MIGRATE.md
@@ -38,6 +38,15 @@ for (ScoreDoc hit : hits.scoreDocs) {
Note that these StoredFields and TermVectors instances should only be consumed in the thread where
they were acquired. For instance, it is illegal to share them across threads.
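
For illustration, a minimal sketch of such thread-confined usage, assuming an `IndexReader reader` and `TopDocs hits` from a prior search (the variable names are illustrative):

```java
// Acquire StoredFields/TermVectors in the thread that consumes them and keep them there.
StoredFields storedFields = reader.storedFields();
TermVectors termVectors = reader.termVectors();
for (ScoreDoc hit : hits.scoreDocs) {
  Document document = storedFields.document(hit.doc);
  Fields vectors = termVectors.get(hit.doc);
  // use document and vectors here; do not hand either object to another thread
}
```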

### Field can no longer configure a TokenStream independently from a value

Lucene 9.x and earlier versions allowed setting a TokenStream on Field instances
independently from a string, binary, or numeric value. This is no longer allowed
on the base Field class. If you need to replicate this behavior, you need to
either provide two fields, one with a TokenStream and the other with a value,
or create a subclass of Field that overrides `TokenStream
tokenStream(Analyzer, TokenStream)` to return a custom TokenStream.
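
For illustration, a minimal sketch of the sub-classing option; the class name and constructor are hypothetical, and only the overridden `tokenStream(Analyzer, TokenStream)` method is the part the note above relies on:

```java
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;

// A Field that carries a string value but indexes a caller-supplied TokenStream.
class PreAnalyzedField extends Field {
  private final TokenStream stream;

  PreAnalyzedField(String name, String value, TokenStream stream, FieldType type) {
    super(name, value, type);
    this.stream = stream;
  }

  @Override
  public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
    // Ignore the analyzer and return the pre-built stream, mirroring the old
    // setTokenStream() behavior.
    return stream;
  }
}
```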

### PersianStemFilter is added to PersianAnalyzer (LUCENE-10312)

PersianAnalyzer now includes PersianStemFilter, which may change analysis results. If you need exactly the same analysis

@@ -38,10 +38,12 @@ public void testBogusTermVectors() throws IOException {
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
Field field = new Field("foo", "", ft);
field.setTokenStream(
new FixBrokenOffsetsFilter(
new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4))));
Field field =
new Field(
"foo",
new FixBrokenOffsetsFilter(
new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4))),
ft);
doc.add(field);
iw.addDocument(doc);
iw.close();
54 changes: 30 additions & 24 deletions lucene/core/src/java/org/apache/lucene/document/Field.java
@@ -23,6 +23,7 @@
import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
@@ -69,13 +70,8 @@ public class Field implements IndexableField {
protected Object fieldsData;

/**
* Pre-analyzed tokenStream for indexed fields; this is separate from fieldsData because you are
* allowed to have both; eg maybe field has a String value but you customize how it's tokenized
*/
protected TokenStream tokenStream;

/**
* Expert: creates a field with no initial value. Intended only for custom Field subclasses.
* Expert: creates a field with no initial value. This is intended to be used by custom {@link
* Field} sub-classes with pre-configured {@link IndexableFieldType}s.
*
* @param name field name
* @param type field type
@@ -149,8 +145,7 @@ public Field(String name, TokenStream tokenStream, IndexableFieldType type) {
}

this.name = name;
this.fieldsData = null;
this.tokenStream = tokenStream;
this.fieldsData = tokenStream;
this.type = type;
}

@@ -210,6 +205,20 @@ public Field(String name, BytesRef bytes, IndexableFieldType type) {
if (type == null) {
throw new IllegalArgumentException("type must not be null");
}
if (type.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0
|| type.storeTermVectorOffsets()) {
throw new IllegalArgumentException("It doesn't make sense to index offsets on binary fields");
}
if (type.indexOptions() != IndexOptions.NONE && type.tokenized()) {
throw new IllegalArgumentException("cannot set a BytesRef value on a tokenized field");
}
if (type.indexOptions() == IndexOptions.NONE
&& type.pointDimensionCount() == 0
&& type.docValuesType() == DocValuesType.NONE
&& type.stored() == false) {
throw new IllegalArgumentException(
"it doesn't make sense to have a field that is neither indexed, nor doc-valued, nor stored");
}
this.name = name;
this.fieldsData = bytes;
this.type = type;
@@ -237,9 +246,9 @@ public Field(String name, CharSequence value, IndexableFieldType type) {
if (type == null) {
throw new IllegalArgumentException("type must not be null");
}
if (!type.stored() && type.indexOptions() == IndexOptions.NONE) {
if (type.stored() == false && type.indexOptions() == IndexOptions.NONE) {
throw new IllegalArgumentException(
"it doesn't make sense to have a field that " + "is neither indexed nor stored");
"it doesn't make sense to have a field that is neither indexed nor stored");
}
this.name = name;
this.fieldsData = value;
@@ -278,7 +287,7 @@ public Reader readerValue() {
* String value is analyzed to produce the indexed tokens.
*/
public TokenStream tokenStreamValue() {
return tokenStream;
return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
}

/**
@@ -329,9 +338,6 @@ public void setBytesValue(BytesRef value) {
+ fieldsData.getClass().getSimpleName()
+ " to BytesRef");
}
if (type.indexOptions() != IndexOptions.NONE) {
throw new IllegalArgumentException("cannot set a BytesRef value on an indexed field");
}
if (value == null) {
throw new IllegalArgumentException("value must not be null");
}
@@ -392,15 +398,15 @@ public void setDoubleValue(double value) {
fieldsData = Double.valueOf(value);
}

/**
* Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized()
* to return true. May be combined with stored values from stringValue() or binaryValue()
*/
/** Expert: sets the token stream to be used for indexing. */
public void setTokenStream(TokenStream tokenStream) {
if (type.indexOptions() == IndexOptions.NONE || !type.tokenized()) {
throw new IllegalArgumentException("TokenStream fields must be indexed and tokenized");
if (!(fieldsData instanceof TokenStream)) {
throw new IllegalArgumentException(
"cannot change value type from "
+ fieldsData.getClass().getSimpleName()
+ " to TokenStream");
}
this.tokenStream = tokenStream;
this.fieldsData = tokenStream;
}

@Override
@@ -478,8 +484,8 @@ public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
}
}

if (tokenStream != null) {
return tokenStream;
if (tokenStreamValue() != null) {
return tokenStreamValue();
} else if (readerValue() != null) {
return analyzer.tokenStream(name(), readerValue());
} else if (stringValue() != null) {
58 changes: 56 additions & 2 deletions lucene/core/src/test/org/apache/lucene/document/TestField.java
@@ -22,6 +22,7 @@
import java.nio.charset.StandardCharsets;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
@@ -424,6 +425,31 @@ public void testStringField() throws Exception {
}
}

public void testBinaryStringField() throws Exception {
Field[] fields =
new Field[] {
new StringField("foo", new BytesRef("bar"), Field.Store.NO),
new StringField("foo", new BytesRef("bar"), Field.Store.YES)
};

for (Field field : fields) {
trySetByteValue(field);
field.setBytesValue("baz".getBytes(StandardCharsets.UTF_8));
assertEquals(new BytesRef("baz"), field.binaryValue());
field.setBytesValue(new BytesRef("baz"));
trySetDoubleValue(field);
trySetIntValue(field);
trySetFloatValue(field);
trySetLongValue(field);
trySetReaderValue(field);
trySetShortValue(field);
trySetStringValue(field);
trySetTokenStreamValue(field);

assertEquals(new BytesRef("baz"), field.binaryValue());
}
}

public void testTextFieldString() throws Exception {
Field[] fields =
new Field[] {
@@ -441,7 +467,7 @@ public void testTextFieldString() throws Exception {
trySetReaderValue(field);
trySetShortValue(field);
field.setStringValue("baz");
field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
trySetTokenStreamValue(field);

assertEquals("baz", field.stringValue());
}
@@ -460,7 +486,7 @@ public void testTextFieldReader() throws Exception {
field.setReaderValue(new StringReader("foobar"));
trySetShortValue(field);
trySetStringValue(field);
field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
trySetTokenStreamValue(field);

assertNotNull(field.readerValue());
}
@@ -730,4 +756,32 @@ private void trySetTokenStreamValue(Field f) {
f.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
});
}

public void testDisabledField() {
// neither indexed nor stored
FieldType ft = new FieldType();
expectThrows(IllegalArgumentException.class, () -> new Field("name", "", ft));
}

public void testTokenizedBinaryField() {
FieldType ft = new FieldType();
ft.setTokenized(true);
ft.setIndexOptions(IndexOptions.DOCS);
expectThrows(IllegalArgumentException.class, () -> new Field("name", new BytesRef(), ft));
}

public void testOffsetsBinaryField() {
FieldType ft = new FieldType();
ft.setTokenized(false);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
expectThrows(IllegalArgumentException.class, () -> new Field("name", new BytesRef(), ft));
}

public void testTermVectorsOffsetsBinaryField() {
FieldType ft = new FieldType();
ft.setTokenized(false);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
expectThrows(IllegalArgumentException.class, () -> new Field("name", new BytesRef(), ft));
}
}

@@ -880,17 +880,21 @@ public void testExcIndexingDocBeforeDocValues() throws Exception {
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
IndexWriter w = new IndexWriter(dir, iwc);
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
ft.setDocValuesType(DocValuesType.SORTED);
ft.freeze();
Field field = new Field("test", "value", ft);
field.setTokenStream(
new TokenStream() {
Field field =
new Field("test", new BytesRef("value"), ft) {
@Override
public boolean incrementToken() {
throw new RuntimeException("no");
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
return new TokenStream() {
@Override
public boolean incrementToken() throws IOException {
throw new RuntimeException();
}
};
}
});
};
doc.add(field);
expectThrows(
RuntimeException.class,
43 changes: 23 additions & 20 deletions lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -1166,41 +1166,44 @@ public void testIndexStoreCombos() throws Exception {
FieldType customType = new FieldType(StoredField.TYPE);
customType.setTokenized(true);

Field f = new Field("binary", b, 10, 17, customType);
final MockTokenizer field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
Field f =
new Field("binary", b, 10, 17, customType) {
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
return field1;
}
};
// TODO: this is evil, changing the type after creating the field:
customType.setIndexOptions(IndexOptions.DOCS);
final MockTokenizer doc1field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc1field1.setReader(new StringReader("doc1field1"));
f.setTokenStream(doc1field1);
field1.setReader(new StringReader("doc1field1"));

FieldType customType2 = new FieldType(TextField.TYPE_STORED);

Field f2 = newField("string", "value", customType2);
final MockTokenizer doc1field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc1field2.setReader(new StringReader("doc1field2"));
f2.setTokenStream(doc1field2);
final MockTokenizer field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
Field f2 =
new Field("string", "value", customType2) {
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
return field2;
}
};

field2.setReader(new StringReader("doc1field2"));
doc.add(f);
doc.add(f2);
w.addDocument(doc);

// add 2 docs to test in-memory merging
final MockTokenizer doc2field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc2field1.setReader(new StringReader("doc2field1"));
f.setTokenStream(doc2field1);
final MockTokenizer doc2field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc2field2.setReader(new StringReader("doc2field2"));
f2.setTokenStream(doc2field2);
field1.setReader(new StringReader("doc2field1"));
field2.setReader(new StringReader("doc2field2"));
w.addDocument(doc);

// force segment flush so we can force a segment merge with doc3 later.
w.commit();

final MockTokenizer doc3field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc3field1.setReader(new StringReader("doc3field1"));
f.setTokenStream(doc3field1);
final MockTokenizer doc3field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc3field2.setReader(new StringReader("doc3field2"));
f2.setTokenStream(doc3field2);
field1.setReader(new StringReader("doc3field1"));
field2.setReader(new StringReader("doc3field2"));

w.addDocument(doc);
w.commit();

@@ -1497,13 +1497,13 @@ public void testAddDocsNonAbortingException() throws Exception {
doc.add(newStringField("id", docCount + "", Field.Store.NO));
doc.add(newTextField("content", "silly content " + docCount, Field.Store.NO));
if (docCount == 4) {
Field f = newTextField("crash", "", Field.Store.NO);
doc.add(f);
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("crash me on the 4th token"));
tokenizer.setEnableChecks(
false); // disable workflow checking as we forcefully close() in exceptional cases.
f.setTokenStream(new CrashingFilter("crash", tokenizer));
Field f =
new Field("crash", new CrashingFilter("crash", tokenizer), TextField.TYPE_NOT_STORED);
doc.add(f);
}
}

@@ -1573,13 +1573,13 @@ public void testUpdateDocsNonAbortingException() throws Exception {
doc.add(newStringField("id", docCount + "", Field.Store.NO));
doc.add(newTextField("content", "silly content " + docCount, Field.Store.NO));
if (docCount == crashAt) {
Field f = newTextField("crash", "", Field.Store.NO);
doc.add(f);
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("crash me on the 4th token"));
tokenizer.setEnableChecks(
false); // disable workflow checking as we forcefully close() in exceptional cases.
f.setTokenStream(new CrashingFilter("crash", tokenizer));
Field f =
new Field("crash", new CrashingFilter("crash", tokenizer), TextField.TYPE_NOT_STORED);
doc.add(f);
}
}
