Revert "Allow reusing indexed binary fields. (#12053)"

This reverts commit 84778549af.
Adrien Grand 2023-01-12 09:48:06 +01:00
parent 84778549af
commit 525a11091c
10 changed files with 81 additions and 152 deletions


@@ -38,15 +38,6 @@ for (ScoreDoc hit : hits.scoreDocs) {
Note that these StoredFields and TermVectors instances should only be consumed in the thread where
they were acquired. For instance, it is illegal to share them across threads.
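A per-thread usage sketch of that rule, reusing the `reader` and `hits` names assumed by the example above:

```java
// Acquire StoredFields in the thread that consumes it; never share the instance.
StoredFields storedFields = reader.storedFields();
for (ScoreDoc hit : hits.scoreDocs) {
  Document doc = storedFields.document(hit.doc);
  // use doc in this thread only
}
```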
### Field can no longer configure a TokenStream independently from a value
Lucene 9.x and earlier versions allowed setting a TokenStream on Field instances
independently of a string, binary, or numeric value. This is no longer allowed on
the base Field class. To replicate this behavior, either provide two fields, one
with a TokenStream and another with a value, or create a sub-class of Field that
overrides `TokenStream tokenStream(Analyzer, TokenStream)` to return a custom
TokenStream.
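For reference, a minimal sketch of the subclass option described above; the class and field names here are illustrative, not part of Lucene:

```java
// Keeps a regular value in the field while handing the indexing chain a
// pre-built TokenStream by overriding tokenStream(Analyzer, TokenStream).
class PreAnalyzedField extends Field {
  private final TokenStream tokens;

  PreAnalyzedField(String name, String value, TokenStream tokens, FieldType type) {
    super(name, value, type);
    this.tokens = tokens;
  }

  @Override
  public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
    return tokens; // ignore the analyzer and return the custom stream
  }
}
```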
### PersianStemFilter is added to PersianAnalyzer (LUCENE-10312)
PersianAnalyzer now includes PersianStemFilter, which changes analysis results. If you need exactly the same analysis


@@ -38,12 +38,10 @@ public class TestFixBrokenOffsetsFilter extends BaseTokenStreamTestCase {
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
Field field =
new Field(
"foo",
new FixBrokenOffsetsFilter(
new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4))),
ft);
Field field = new Field("foo", "", ft);
field.setTokenStream(
new FixBrokenOffsetsFilter(
new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4))));
doc.add(field);
iw.addDocument(doc);
iw.close();


@@ -23,7 +23,6 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
@@ -70,8 +69,13 @@ public class Field implements IndexableField {
protected Object fieldsData;
/**
* Expert: creates a field with no initial value. This is intended to be used by custom {@link
* Field} sub-classes with pre-configured {@link IndexableFieldType}s.
* Pre-analyzed tokenStream for indexed fields; this is separate from fieldsData because you are
* allowed to have both; eg maybe field has a String value but you customize how it's tokenized
*/
protected TokenStream tokenStream;
/**
* Expert: creates a field with no initial value. Intended only for custom Field subclasses.
*
* @param name field name
* @param type field type
@@ -145,7 +149,8 @@ public class Field implements IndexableField {
}
this.name = name;
this.fieldsData = tokenStream;
this.fieldsData = null;
this.tokenStream = tokenStream;
this.type = type;
}
@@ -205,20 +210,6 @@ public class Field implements IndexableField {
if (type == null) {
throw new IllegalArgumentException("type must not be null");
}
if (type.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0
|| type.storeTermVectorOffsets()) {
throw new IllegalArgumentException("It doesn't make sense to index offsets on binary fields");
}
if (type.indexOptions() != IndexOptions.NONE && type.tokenized()) {
throw new IllegalArgumentException("cannot set a BytesRef value on a tokenized field");
}
if (type.indexOptions() == IndexOptions.NONE
&& type.pointDimensionCount() == 0
&& type.docValuesType() == DocValuesType.NONE
&& type.stored() == false) {
throw new IllegalArgumentException(
"it doesn't make sense to have a field that is neither indexed, nor doc-valued, nor stored");
}
this.name = name;
this.fieldsData = bytes;
this.type = type;
@@ -246,9 +237,9 @@ public class Field implements IndexableField {
if (type == null) {
throw new IllegalArgumentException("type must not be null");
}
if (type.stored() == false && type.indexOptions() == IndexOptions.NONE) {
if (!type.stored() && type.indexOptions() == IndexOptions.NONE) {
throw new IllegalArgumentException(
"it doesn't make sense to have a field that is neither indexed nor stored");
"it doesn't make sense to have a field that " + "is neither indexed nor stored");
}
this.name = name;
this.fieldsData = value;
@@ -287,7 +278,7 @@ public class Field implements IndexableField {
* String value is analyzed to produce the indexed tokens.
*/
public TokenStream tokenStreamValue() {
return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
return tokenStream;
}
/**
@@ -338,6 +329,9 @@ public class Field implements IndexableField {
+ fieldsData.getClass().getSimpleName()
+ " to BytesRef");
}
if (type.indexOptions() != IndexOptions.NONE) {
throw new IllegalArgumentException("cannot set a BytesRef value on an indexed field");
}
if (value == null) {
throw new IllegalArgumentException("value must not be null");
}
@@ -398,15 +392,15 @@ public class Field implements IndexableField {
fieldsData = Double.valueOf(value);
}
/** Expert: sets the token stream to be used for indexing. */
/**
* Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized()
* to return true. May be combined with stored values from stringValue() or binaryValue()
*/
public void setTokenStream(TokenStream tokenStream) {
if (!(fieldsData instanceof TokenStream)) {
throw new IllegalArgumentException(
"cannot change value type from "
+ fieldsData.getClass().getSimpleName()
+ " to TokenStream");
if (type.indexOptions() == IndexOptions.NONE || !type.tokenized()) {
throw new IllegalArgumentException("TokenStream fields must be indexed and tokenized");
}
this.fieldsData = tokenStream;
this.tokenStream = tokenStream;
}
@Override
@@ -484,8 +478,8 @@ public class Field implements IndexableField {
}
}
if (tokenStreamValue() != null) {
return tokenStreamValue();
if (tokenStream != null) {
return tokenStream;
} else if (readerValue() != null) {
return analyzer.tokenStream(name(), readerValue());
} else if (stringValue() != null) {
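The restored javadoc above notes that a custom token stream may again be combined with a stored string or binary value. A minimal usage sketch of that combination, assuming `WhitespaceTokenizer` from the analysis-common module; the field name and text are illustrative:

```java
FieldType ft = new FieldType(TextField.TYPE_STORED); // indexed, tokenized, stored
Field body = new Field("body", "the stored text", ft);

Tokenizer tokens = new WhitespaceTokenizer();
tokens.setReader(new StringReader("custom indexed tokens"));
body.setTokenStream(tokens); // indexed terms come from the stream; stringValue() keeps the stored text

Document doc = new Document();
doc.add(body);
```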


@@ -23,7 +23,6 @@ import java.nio.charset.StandardCharsets;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
@@ -426,31 +425,6 @@ public class TestField extends LuceneTestCase {
}
}
public void testBinaryStringField() throws Exception {
Field[] fields =
new Field[] {
new StringField("foo", new BytesRef("bar"), Field.Store.NO),
new StringField("foo", new BytesRef("bar"), Field.Store.YES)
};
for (Field field : fields) {
trySetByteValue(field);
field.setBytesValue("baz".getBytes(StandardCharsets.UTF_8));
assertEquals(new BytesRef("baz"), field.binaryValue());
field.setBytesValue(new BytesRef("baz"));
trySetDoubleValue(field);
trySetIntValue(field);
trySetFloatValue(field);
trySetLongValue(field);
trySetReaderValue(field);
trySetShortValue(field);
trySetStringValue(field);
trySetTokenStreamValue(field);
assertEquals(new BytesRef("baz"), field.binaryValue());
}
}
public void testTextFieldString() throws Exception {
Field[] fields =
new Field[] {
@@ -468,7 +442,7 @@ public class TestField extends LuceneTestCase {
trySetReaderValue(field);
trySetShortValue(field);
field.setStringValue("baz");
trySetTokenStreamValue(field);
field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
assertEquals("baz", field.stringValue());
}
@@ -487,7 +461,7 @@ public class TestField extends LuceneTestCase {
field.setReaderValue(new StringReader("foobar"));
trySetShortValue(field);
trySetStringValue(field);
trySetTokenStreamValue(field);
field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
assertNotNull(field.readerValue());
}
@@ -754,32 +728,4 @@ public class TestField extends LuceneTestCase {
f.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
});
}
public void testDisabledField() {
// neither indexed nor stored
FieldType ft = new FieldType();
expectThrows(IllegalArgumentException.class, () -> new Field("name", "", ft));
}
public void testTokenizedBinaryField() {
FieldType ft = new FieldType();
ft.setTokenized(true);
ft.setIndexOptions(IndexOptions.DOCS);
expectThrows(IllegalArgumentException.class, () -> new Field("name", new BytesRef(), ft));
}
public void testOffsetsBinaryField() {
FieldType ft = new FieldType();
ft.setTokenized(false);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
expectThrows(IllegalArgumentException.class, () -> new Field("name", new BytesRef(), ft));
}
public void testTermVectorsOffsetsBinaryField() {
FieldType ft = new FieldType();
ft.setTokenized(false);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
expectThrows(IllegalArgumentException.class, () -> new Field("name", new BytesRef(), ft));
}
}


@@ -880,21 +880,17 @@ public class TestDocValuesIndexing extends LuceneTestCase {
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
IndexWriter w = new IndexWriter(dir, iwc);
Document doc = new Document();
FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setDocValuesType(DocValuesType.SORTED);
ft.freeze();
Field field =
new Field("test", new BytesRef("value"), ft) {
Field field = new Field("test", "value", ft);
field.setTokenStream(
new TokenStream() {
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
return new TokenStream() {
@Override
public boolean incrementToken() throws IOException {
throw new RuntimeException();
}
};
public boolean incrementToken() {
throw new RuntimeException("no");
}
};
});
doc.add(field);
expectThrows(
RuntimeException.class,


@@ -1166,44 +1166,41 @@ public class TestIndexWriter extends LuceneTestCase {
FieldType customType = new FieldType(StoredField.TYPE);
customType.setTokenized(true);
final MockTokenizer field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
Field f =
new Field("binary", b, 10, 17, customType) {
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
return field1;
}
};
Field f = new Field("binary", b, 10, 17, customType);
// TODO: this is evil, changing the type after creating the field:
customType.setIndexOptions(IndexOptions.DOCS);
field1.setReader(new StringReader("doc1field1"));
final MockTokenizer doc1field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc1field1.setReader(new StringReader("doc1field1"));
f.setTokenStream(doc1field1);
FieldType customType2 = new FieldType(TextField.TYPE_STORED);
final MockTokenizer field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
Field f2 =
new Field("string", "value", customType2) {
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
return field2;
}
};
field2.setReader(new StringReader("doc1field2"));
Field f2 = newField("string", "value", customType2);
final MockTokenizer doc1field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc1field2.setReader(new StringReader("doc1field2"));
f2.setTokenStream(doc1field2);
doc.add(f);
doc.add(f2);
w.addDocument(doc);
// add 2 docs to test in-memory merging
field1.setReader(new StringReader("doc2field1"));
field2.setReader(new StringReader("doc2field2"));
final MockTokenizer doc2field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc2field1.setReader(new StringReader("doc2field1"));
f.setTokenStream(doc2field1);
final MockTokenizer doc2field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc2field2.setReader(new StringReader("doc2field2"));
f2.setTokenStream(doc2field2);
w.addDocument(doc);
// force segment flush so we can force a segment merge with doc3 later.
w.commit();
field1.setReader(new StringReader("doc3field1"));
field2.setReader(new StringReader("doc3field2"));
final MockTokenizer doc3field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc3field1.setReader(new StringReader("doc3field1"));
f.setTokenStream(doc3field1);
final MockTokenizer doc3field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc3field2.setReader(new StringReader("doc3field2"));
f2.setTokenStream(doc3field2);
w.addDocument(doc);
w.commit();


@@ -1497,13 +1497,13 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
doc.add(newStringField("id", docCount + "", Field.Store.NO));
doc.add(newTextField("content", "silly content " + docCount, Field.Store.NO));
if (docCount == 4) {
Field f = newTextField("crash", "", Field.Store.NO);
doc.add(f);
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("crash me on the 4th token"));
tokenizer.setEnableChecks(
false); // disable workflow checking as we forcefully close() in exceptional cases.
Field f =
new Field("crash", new CrashingFilter("crash", tokenizer), TextField.TYPE_NOT_STORED);
doc.add(f);
f.setTokenStream(new CrashingFilter("crash", tokenizer));
}
}
@@ -1573,13 +1573,13 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
doc.add(newStringField("id", docCount + "", Field.Store.NO));
doc.add(newTextField("content", "silly content " + docCount, Field.Store.NO));
if (docCount == crashAt) {
Field f = newTextField("crash", "", Field.Store.NO);
doc.add(f);
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("crash me on the 4th token"));
tokenizer.setEnableChecks(
false); // disable workflow checking as we forcefully close() in exceptional cases.
Field f =
new Field("crash", new CrashingFilter("crash", tokenizer), TextField.TYPE_NOT_STORED);
doc.add(f);
f.setTokenStream(new CrashingFilter("crash", tokenizer));
}
}


@@ -568,9 +568,10 @@ public class TestPayloads extends LuceneTestCase {
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
Document doc = new Document();
Field field = new TextField("field", "", Field.Store.NO);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer) ts).setReader(new StringReader("here we go"));
Field field = new Field("field", ts, TextField.TYPE_NOT_STORED);
field.setTokenStream(ts);
doc.add(field);
writer.addDocument(doc);
Token withPayload = new Token("withPayload", 0, 11);
@@ -600,20 +601,22 @@ public class TestPayloads extends LuceneTestCase {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
Field field = new TextField("field", "", Field.Store.NO);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
Field field = new Field("field", ts, TextField.TYPE_NOT_STORED);
((Tokenizer) ts).setReader(new StringReader("here we go"));
field.setTokenStream(ts);
doc.add(field);
Field field2 = new TextField("field", "", Field.Store.NO);
Token withPayload = new Token("withPayload", 0, 11);
withPayload.setPayload(new BytesRef("test"));
ts = new CannedTokenStream(withPayload);
assertTrue(ts.hasAttribute(PayloadAttribute.class));
Field field2 = new Field("field", ts, TextField.TYPE_NOT_STORED);
field2.setTokenStream(ts);
doc.add(field2);
Field field3 = new TextField("field", "", Field.Store.NO);
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer) ts).setReader(new StringReader("nopayload"));
Field field3 = new Field("field", ts, TextField.TYPE_NOT_STORED);
field3.setTokenStream(ts);
doc.add(field3);
writer.addDocument(doc);
DirectoryReader reader = writer.getReader();


@@ -47,9 +47,10 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
customType.setStoreTermVectorPositions(true);
customType.setStoreTermVectorPayloads(true);
customType.setStoreTermVectorOffsets(random().nextBoolean());
Field field = new Field("field", "", customType);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer) ts).setReader(new StringReader("here we go"));
Field field = new Field("field", ts, customType);
field.setTokenStream(ts);
doc.add(field);
writer.addDocument(doc);
@@ -89,19 +90,22 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
customType.setStoreTermVectorPositions(true);
customType.setStoreTermVectorPayloads(true);
customType.setStoreTermVectorOffsets(random().nextBoolean());
Field field = new Field("field", "", customType);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer) ts).setReader(new StringReader("here we go"));
Field field = new Field("field", ts, customType);
field.setTokenStream(ts);
doc.add(field);
Field field2 = new Field("field", "", customType);
Token withPayload = new Token("withPayload", 0, 11);
withPayload.setPayload(new BytesRef("test"));
ts = new CannedTokenStream(withPayload);
assertTrue(ts.hasAttribute(PayloadAttribute.class));
Field field2 = new Field("field", ts, customType);
field2.setTokenStream(ts);
doc.add(field2);
Field field3 = new Field("field", "", customType);
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer) ts).setReader(new StringReader("nopayload"));
Field field3 = new Field("field", ts, customType);
field3.setTokenStream(ts);
doc.add(field3);
writer.addDocument(doc);
DirectoryReader reader = writer.getReader();


@@ -51,6 +51,8 @@ public class TestTerms extends LuceneTestCase {
BytesRef maxTerm = null;
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
Field field = new TextField("field", "", Field.Store.NO);
doc.add(field);
// System.out.println(" doc " + i);
CannedBinaryTokenStream.BinaryToken[] tokens =
new CannedBinaryTokenStream.BinaryToken[atLeast(10)];
@@ -69,9 +71,7 @@ public class TestTerms extends LuceneTestCase {
}
tokens[j] = new CannedBinaryTokenStream.BinaryToken(tokenBytes);
}
Field field =
new Field("field", new CannedBinaryTokenStream(tokens), TextField.TYPE_NOT_STORED);
doc.add(field);
field.setTokenStream(new CannedBinaryTokenStream(tokens));
w.addDocument(doc);
}