From 525a11091ca9b9076f8c1b3db41b52947090506a Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Thu, 12 Jan 2023 09:48:06 +0100
Subject: [PATCH] Revert "Allow reusing indexed binary fields. (#12053)"

This reverts commit 84778549afabc32cd6e5a3459b1b8aaa89b74a74.
---
 lucene/MIGRATE.md                              |  9 ---
 .../TestFixBrokenOffsetsFilter.java            | 10 ++--
 .../org/apache/lucene/document/Field.java      | 54 ++++++++----------
 .../org/apache/lucene/document/TestField.java  | 58 +------------------
 .../lucene/index/TestDocValuesIndexing.java    | 18 +++---
 .../apache/lucene/index/TestIndexWriter.java   | 43 +++++++-------
 .../index/TestIndexWriterExceptions.java       | 12 ++--
 .../org/apache/lucene/index/TestPayloads.java  | 11 ++--
 .../lucene/index/TestPayloadsOnVectors.java    | 12 ++--
 .../org/apache/lucene/index/TestTerms.java     |  6 +-
 10 files changed, 81 insertions(+), 152 deletions(-)

diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md
index e731cfd84ab8..7c00aa1ad7a6 100644
--- a/lucene/MIGRATE.md
+++ b/lucene/MIGRATE.md
@@ -38,15 +38,6 @@ for (ScoreDoc hit : hits.scoreDocs) {
 Note that these StoredFields and TermVectors instances should only be consumed in the thread where
 they were acquired. For instance, it is illegal to share them across threads.
 
-### Field can no longer configure a TokenStream independently from a value
-
-Lucene 9.x and earlier versions allowed to set a TokenStream on Field instances
-independently from a string, binary or numeric value. This is no longer allowed
-on the base Field class. If you need to replicate this behavior, you need to
-either provide two fields, one with a TokenStream and another one with a value,
-or create a sub-class of Field that overrides `TokenStream
-tokenStream(Analyzer, TokenStream)` to return a custom TokenStream.
-
 ### PersianStemFilter is added to PersianAnalyzer (LUCENE-10312)
 
 PersianAnalyzer now includes PersianStemFilter, that would change analysis results.
 If you need the exactly same analysis
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFixBrokenOffsetsFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFixBrokenOffsetsFilter.java
index a670c05143f1..4b9c8e174249 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFixBrokenOffsetsFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFixBrokenOffsetsFilter.java
@@ -38,12 +38,10 @@ public void testBogusTermVectors() throws IOException {
     FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
     ft.setStoreTermVectors(true);
     ft.setStoreTermVectorOffsets(true);
-    Field field =
-        new Field(
-            "foo",
-            new FixBrokenOffsetsFilter(
-                new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4))),
-            ft);
+    Field field = new Field("foo", "", ft);
+    field.setTokenStream(
+        new FixBrokenOffsetsFilter(
+            new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4))));
     doc.add(field);
     iw.addDocument(doc);
     iw.close();
diff --git a/lucene/core/src/java/org/apache/lucene/document/Field.java b/lucene/core/src/java/org/apache/lucene/document/Field.java
index 94e86c7bcdbb..126e85e3e220 100644
--- a/lucene/core/src/java/org/apache/lucene/document/Field.java
+++ b/lucene/core/src/java/org/apache/lucene/document/Field.java
@@ -23,7 +23,6 @@
 import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.IndexableFieldType;
@@ -70,8 +69,13 @@ public class Field implements IndexableField {
   protected Object fieldsData;
 
   /**
-   * Expert: creates a field with no initial value. This is intended to be used by custom {@link
-   * Field} sub-classes with pre-configured {@link IndexableFieldType}s.
+   * Pre-analyzed tokenStream for indexed fields; this is separate from fieldsData because you are
+   * allowed to have both; eg maybe field has a String value but you customize how it's tokenized
+   */
+  protected TokenStream tokenStream;
+
+  /**
+   * Expert: creates a field with no initial value. Intended only for custom Field subclasses.
    *
    * @param name field name
   * @param type field type
@@ -145,7 +149,8 @@ public Field(String name, TokenStream tokenStream, IndexableFieldType type) {
     }
 
     this.name = name;
-    this.fieldsData = tokenStream;
+    this.fieldsData = null;
+    this.tokenStream = tokenStream;
     this.type = type;
   }
 
@@ -205,20 +210,6 @@ public Field(String name, BytesRef bytes, IndexableFieldType type) {
     if (type == null) {
       throw new IllegalArgumentException("type must not be null");
     }
-    if (type.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0
-        || type.storeTermVectorOffsets()) {
-      throw new IllegalArgumentException("It doesn't make sense to index offsets on binary fields");
-    }
-    if (type.indexOptions() != IndexOptions.NONE && type.tokenized()) {
-      throw new IllegalArgumentException("cannot set a BytesRef value on a tokenized field");
-    }
-    if (type.indexOptions() == IndexOptions.NONE
-        && type.pointDimensionCount() == 0
-        && type.docValuesType() == DocValuesType.NONE
-        && type.stored() == false) {
-      throw new IllegalArgumentException(
-          "it doesn't make sense to have a field that is neither indexed, nor doc-valued, nor stored");
-    }
     this.name = name;
     this.fieldsData = bytes;
     this.type = type;
   }
@@ -246,9 +237,9 @@ public Field(String name, CharSequence value, IndexableFieldType type) {
     if (type == null) {
       throw new IllegalArgumentException("type must not be null");
     }
-    if (type.stored() == false && type.indexOptions() == IndexOptions.NONE) {
+    if (!type.stored() && type.indexOptions() == IndexOptions.NONE) {
       throw new IllegalArgumentException(
-          "it doesn't make sense to have a field that is neither indexed nor stored");
+          "it doesn't make sense to have a field that " + "is neither indexed nor stored");
     }
     this.name = name;
     this.fieldsData = value;
@@ -287,7 +278,7 @@ public Reader readerValue() {
    * String value is analyzed to produce the indexed tokens.
    */
   public TokenStream tokenStreamValue() {
-    return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
+    return tokenStream;
   }
 
   /**
@@ -338,6 +329,9 @@ public void setBytesValue(BytesRef value) {
               + fieldsData.getClass().getSimpleName()
               + " to BytesRef");
     }
+    if (type.indexOptions() != IndexOptions.NONE) {
+      throw new IllegalArgumentException("cannot set a BytesRef value on an indexed field");
+    }
     if (value == null) {
       throw new IllegalArgumentException("value must not be null");
     }
@@ -398,15 +392,15 @@ public void setDoubleValue(double value) {
     fieldsData = Double.valueOf(value);
   }
 
-  /** Expert: sets the token stream to be used for indexing. */
+  /**
+   * Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized()
+   * to return true. May be combined with stored values from stringValue() or binaryValue()
+   */
   public void setTokenStream(TokenStream tokenStream) {
-    if (!(fieldsData instanceof TokenStream)) {
-      throw new IllegalArgumentException(
-          "cannot change value type from "
-              + fieldsData.getClass().getSimpleName()
-              + " to TokenStream");
+    if (type.indexOptions() == IndexOptions.NONE || !type.tokenized()) {
+      throw new IllegalArgumentException("TokenStream fields must be indexed and tokenized");
     }
-    this.fieldsData = tokenStream;
+    this.tokenStream = tokenStream;
   }
 
   @Override
@@ -484,8 +478,8 @@ public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
       }
     }
 
-    if (tokenStreamValue() != null) {
-      return tokenStreamValue();
+    if (tokenStream != null) {
+      return tokenStream;
     } else if (readerValue() != null) {
       return analyzer.tokenStream(name(), readerValue());
     } else if (stringValue() != null) {
diff --git a/lucene/core/src/test/org/apache/lucene/document/TestField.java b/lucene/core/src/test/org/apache/lucene/document/TestField.java
index 0c61a9d2ea62..534f06ff7a6c 100644
--- a/lucene/core/src/test/org/apache/lucene/document/TestField.java
+++ b/lucene/core/src/test/org/apache/lucene/document/TestField.java
@@ -23,7 +23,6 @@
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Term;
@@ -426,31 +425,6 @@ public void testStringField() throws Exception {
     }
   }
 
-  public void testBinaryStringField() throws Exception {
-    Field[] fields =
-        new Field[] {
-          new StringField("foo", new BytesRef("bar"), Field.Store.NO),
-          new StringField("foo", new BytesRef("bar"), Field.Store.YES)
-        };
-
-    for (Field field : fields) {
-      trySetByteValue(field);
-      field.setBytesValue("baz".getBytes(StandardCharsets.UTF_8));
-      assertEquals(new BytesRef("baz"), field.binaryValue());
-      field.setBytesValue(new BytesRef("baz"));
-      trySetDoubleValue(field);
-      trySetIntValue(field);
-      trySetFloatValue(field);
-      trySetLongValue(field);
-      trySetReaderValue(field);
-      trySetShortValue(field);
-      trySetStringValue(field);
-      trySetTokenStreamValue(field);
-
-      assertEquals(new BytesRef("baz"), field.binaryValue());
-    }
-  }
-
   public void testTextFieldString() throws Exception {
     Field[] fields =
         new Field[] {
@@ -468,7 +442,7 @@ public void testTextFieldString() throws Exception {
       trySetReaderValue(field);
       trySetShortValue(field);
       field.setStringValue("baz");
-      trySetTokenStreamValue(field);
+      field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
 
       assertEquals("baz", field.stringValue());
     }
@@ -487,7 +461,7 @@ public void testTextFieldReader() throws Exception {
       field.setReaderValue(new StringReader("foobar"));
       trySetShortValue(field);
       trySetStringValue(field);
-      trySetTokenStreamValue(field);
+      field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
 
       assertNotNull(field.readerValue());
     }
@@ -754,32 +728,4 @@ private void trySetTokenStreamValue(Field f) {
           f.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
         });
   }
-
-  public void testDisabledField() {
-    // neither indexed nor stored
-    FieldType ft = new FieldType();
-    expectThrows(IllegalArgumentException.class, () -> new Field("name", "", ft));
-  }
-
-  public void testTokenizedBinaryField() {
-    FieldType ft = new FieldType();
-    ft.setTokenized(true);
-    ft.setIndexOptions(IndexOptions.DOCS);
-    expectThrows(IllegalArgumentException.class, () -> new Field("name", new BytesRef(), ft));
-  }
-
-  public void testOffsetsBinaryField() {
-    FieldType ft = new FieldType();
-    ft.setTokenized(false);
-    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-    expectThrows(IllegalArgumentException.class, () -> new Field("name", new BytesRef(), ft));
-  }
-
-  public void testTermVectorsOffsetsBinaryField() {
-    FieldType ft = new FieldType();
-    ft.setTokenized(false);
-    ft.setStoreTermVectors(true);
-    ft.setStoreTermVectorOffsets(true);
-    expectThrows(IllegalArgumentException.class, () -> new Field("name", new BytesRef(), ft));
-  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java b/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java
index c7381310b371..3ff0664193fc 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java
@@ -880,21 +880,17 @@ public void testExcIndexingDocBeforeDocValues() throws Exception {
     IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
     IndexWriter w = new IndexWriter(dir, iwc);
     Document doc = new Document();
-    FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
+    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
     ft.setDocValuesType(DocValuesType.SORTED);
     ft.freeze();
-    Field field =
-        new Field("test", new BytesRef("value"), ft) {
+    Field field = new Field("test", "value", ft);
+    field.setTokenStream(
+        new TokenStream() {
           @Override
-          public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
-            return new TokenStream() {
-              @Override
-              public boolean incrementToken() throws IOException {
-                throw new RuntimeException();
-              }
-            };
+          public boolean incrementToken() {
+            throw new RuntimeException("no");
           }
-        };
+        });
     doc.add(field);
     expectThrows(
         RuntimeException.class,
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
index 156d4f596f6d..e9a2fd9141dd 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -1166,44 +1166,41 @@ public void testIndexStoreCombos() throws Exception {
     FieldType customType = new FieldType(StoredField.TYPE);
     customType.setTokenized(true);
 
-    final MockTokenizer field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
-    Field f =
-        new Field("binary", b, 10, 17, customType) {
-          @Override
-          public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
-            return field1;
-          }
-        };
+    Field f = new Field("binary", b, 10, 17, customType);
     // TODO: this is evil, changing the type after creating the field:
     customType.setIndexOptions(IndexOptions.DOCS);
-    field1.setReader(new StringReader("doc1field1"));
+    final MockTokenizer doc1field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    doc1field1.setReader(new StringReader("doc1field1"));
+    f.setTokenStream(doc1field1);
 
     FieldType customType2 = new FieldType(TextField.TYPE_STORED);
-    final MockTokenizer field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
-    Field f2 =
-        new Field("string", "value", customType2) {
-          @Override
-          public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
-            return field2;
-          }
-        };
-
-    field2.setReader(new StringReader("doc1field2"));
+    Field f2 = newField("string", "value", customType2);
+    final MockTokenizer doc1field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    doc1field2.setReader(new StringReader("doc1field2"));
+    f2.setTokenStream(doc1field2);
 
     doc.add(f);
     doc.add(f2);
     w.addDocument(doc);
 
     // add 2 docs to test in-memory merging
-    field1.setReader(new StringReader("doc2field1"));
-    field2.setReader(new StringReader("doc2field2"));
+    final MockTokenizer doc2field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    doc2field1.setReader(new StringReader("doc2field1"));
+    f.setTokenStream(doc2field1);
+    final MockTokenizer doc2field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    doc2field2.setReader(new StringReader("doc2field2"));
+    f2.setTokenStream(doc2field2);
    w.addDocument(doc);
 
     // force segment flush so we can force a segment merge with doc3 later.
     w.commit();
 
-    field1.setReader(new StringReader("doc3field1"));
-    field2.setReader(new StringReader("doc3field2"));
+    final MockTokenizer doc3field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    doc3field1.setReader(new StringReader("doc3field1"));
+    f.setTokenStream(doc3field1);
+    final MockTokenizer doc3field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    doc3field2.setReader(new StringReader("doc3field2"));
+    f2.setTokenStream(doc3field2);
 
     w.addDocument(doc);
     w.commit();
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java
index c11c1be3b8f4..1b36eeca29b6 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java
@@ -1497,13 +1497,13 @@ public void testAddDocsNonAbortingException() throws Exception {
       doc.add(newStringField("id", docCount + "", Field.Store.NO));
       doc.add(newTextField("content", "silly content " + docCount, Field.Store.NO));
       if (docCount == 4) {
+        Field f = newTextField("crash", "", Field.Store.NO);
+        doc.add(f);
         MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
         tokenizer.setReader(new StringReader("crash me on the 4th token"));
         tokenizer.setEnableChecks(
             false); // disable workflow checking as we forcefully close() in exceptional cases.
-        Field f =
-            new Field("crash", new CrashingFilter("crash", tokenizer), TextField.TYPE_NOT_STORED);
-        doc.add(f);
+        f.setTokenStream(new CrashingFilter("crash", tokenizer));
       }
     }
 
@@ -1573,13 +1573,13 @@ public void testUpdateDocsNonAbortingException() throws Exception {
       doc.add(newStringField("id", docCount + "", Field.Store.NO));
       doc.add(newTextField("content", "silly content " + docCount, Field.Store.NO));
       if (docCount == crashAt) {
+        Field f = newTextField("crash", "", Field.Store.NO);
+        doc.add(f);
         MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
         tokenizer.setReader(new StringReader("crash me on the 4th token"));
         tokenizer.setEnableChecks(
             false); // disable workflow checking as we forcefully close() in exceptional cases.
-        Field f =
-            new Field("crash", new CrashingFilter("crash", tokenizer), TextField.TYPE_NOT_STORED);
-        doc.add(f);
+        f.setTokenStream(new CrashingFilter("crash", tokenizer));
       }
     }
 
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java b/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java
index e05f3ae66331..4174b77827b7 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java
@@ -568,9 +568,10 @@ public void testMixupDocs() throws Exception {
     iwc.setMergePolicy(newLogMergePolicy());
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
     Document doc = new Document();
+    Field field = new TextField("field", "", Field.Store.NO);
     TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
     ((Tokenizer) ts).setReader(new StringReader("here we go"));
-    Field field = new Field("field", ts, TextField.TYPE_NOT_STORED);
+    field.setTokenStream(ts);
     doc.add(field);
     writer.addDocument(doc);
     Token withPayload = new Token("withPayload", 0, 11);
@@ -600,20 +601,22 @@ public void testMixupMultiValued() throws Exception {
     Directory dir = newDirectory();
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
     Document doc = new Document();
+    Field field = new TextField("field", "", Field.Store.NO);
     TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
-    Field field = new Field("field", ts, TextField.TYPE_NOT_STORED);
     ((Tokenizer) ts).setReader(new StringReader("here we go"));
     field.setTokenStream(ts);
     doc.add(field);
+    Field field2 = new TextField("field", "", Field.Store.NO);
     Token withPayload = new Token("withPayload", 0, 11);
     withPayload.setPayload(new BytesRef("test"));
     ts = new CannedTokenStream(withPayload);
     assertTrue(ts.hasAttribute(PayloadAttribute.class));
-    Field field2 = new Field("field", ts, TextField.TYPE_NOT_STORED);
+    field2.setTokenStream(ts);
     doc.add(field2);
+    Field field3 = new TextField("field", "", Field.Store.NO);
     ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
     ((Tokenizer) ts).setReader(new StringReader("nopayload"));
-    Field field3 = new Field("field", ts, TextField.TYPE_NOT_STORED);
+    field3.setTokenStream(ts);
     doc.add(field3);
     writer.addDocument(doc);
     DirectoryReader reader = writer.getReader();
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java b/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java
index 5493b1cd3ee0..8bcb2bff305c 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java
@@ -47,9 +47,10 @@ public void testMixupDocs() throws Exception {
     customType.setStoreTermVectorPositions(true);
     customType.setStoreTermVectorPayloads(true);
     customType.setStoreTermVectorOffsets(random().nextBoolean());
+    Field field = new Field("field", "", customType);
     TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
     ((Tokenizer) ts).setReader(new StringReader("here we go"));
-    Field field = new Field("field", ts, customType);
+    field.setTokenStream(ts);
     doc.add(field);
     writer.addDocument(doc);
 
@@ -89,19 +90,22 @@ public void testMixupMultiValued() throws Exception {
     customType.setStoreTermVectorPositions(true);
     customType.setStoreTermVectorPayloads(true);
     customType.setStoreTermVectorOffsets(random().nextBoolean());
+    Field field = new Field("field", "", customType);
     TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
     ((Tokenizer) ts).setReader(new StringReader("here we go"));
-    Field field = new Field("field", ts, customType);
+    field.setTokenStream(ts);
     doc.add(field);
+    Field field2 = new Field("field", "", customType);
     Token withPayload = new Token("withPayload", 0, 11);
     withPayload.setPayload(new BytesRef("test"));
     ts = new CannedTokenStream(withPayload);
     assertTrue(ts.hasAttribute(PayloadAttribute.class));
-    Field field2 = new Field("field", ts, customType);
+    field2.setTokenStream(ts);
     doc.add(field2);
+    Field field3 = new Field("field", "", customType);
     ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
     ((Tokenizer) ts).setReader(new StringReader("nopayload"));
-    Field field3 = new Field("field", ts, customType);
+    field3.setTokenStream(ts);
     doc.add(field3);
     writer.addDocument(doc);
     DirectoryReader reader = writer.getReader();
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTerms.java b/lucene/core/src/test/org/apache/lucene/index/TestTerms.java
index 9508c383746f..84067ced62cd 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestTerms.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestTerms.java
@@ -51,6 +51,8 @@ public void testTermMinMaxRandom() throws Exception {
     BytesRef minTerm = null;
     BytesRef maxTerm = null;
     for (int i = 0; i < numDocs; i++) {
       Document doc = new Document();
+      Field field = new TextField("field", "", Field.Store.NO);
+      doc.add(field);
       // System.out.println(" doc " + i);
       CannedBinaryTokenStream.BinaryToken[] tokens =
           new CannedBinaryTokenStream.BinaryToken[atLeast(10)];
@@ -69,9 +71,7 @@ public void testTermMinMaxRandom() throws Exception {
         }
         tokens[j] = new CannedBinaryTokenStream.BinaryToken(tokenBytes);
       }
-      Field field =
-          new Field("field", new CannedBinaryTokenStream(tokens), TextField.TYPE_NOT_STORED);
-      doc.add(field);
+      field.setTokenStream(new CannedBinaryTokenStream(tokens));
      w.addDocument(doc);
    }
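
For readers following the migration story: the MIGRATE.md entry removed above described the 10.0 workaround (either index two fields, or subclass Field and override tokenStream(Analyzer, TokenStream)); with this revert that workaround is no longer needed, and the pattern used throughout the updated tests works again. A minimal sketch of that restored pattern, assuming an existing IndexWriter and any pre-built TokenStream (the method and field names below are illustrative, not part of this patch):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;

    // Sketch only: index a document whose tokens come from a pre-analyzed TokenStream.
    static void addPreAnalyzedDoc(IndexWriter writer, TokenStream tokens) throws IOException {
      Document doc = new Document();
      // TextField is indexed and tokenized, so Field.setTokenStream() is accepted after this revert;
      // the empty string value is never used for indexing once a TokenStream is set.
      Field field = new TextField("field", "", Field.Store.NO);
      field.setTokenStream(tokens);
      doc.add(field);
      writer.addDocument(doc);
    }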