LUCENE-5989: allow passing BytesRef to StringField to make it easier to index arbitrary binary tokens

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1672781 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2015-04-10 22:24:46 +00:00
parent 09ddb7d912
commit 249d0d25fe
20 changed files with 242 additions and 89 deletions

View File

@ -58,6 +58,9 @@ New Features
accuracy of SDV. Includes optimized Intersect predicate to avoid many
geometry checks. Uses TwoPhaseIterator. (David Smiley)
* LUCENE-5989: Allow passing BytesRef to StringField to make it easier
to index arbitrary binary tokens (Mike McCandless)
Optimizations
* LUCENE-6379: IndexWriter.deleteDocuments(Query...) now detects if

View File

@ -17,6 +17,7 @@
package org.apache.lucene.benchmark.quality.utils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@ -50,7 +51,8 @@ public class DocNameExtractor {
final List<String> name = new ArrayList<>();
searcher.getIndexReader().document(docid, new StoredFieldVisitor() {
@Override
public void stringField(FieldInfo fieldInfo, String value) {
public void stringField(FieldInfo fieldInfo, byte[] bytes) {
String value = new String(bytes, StandardCharsets.UTF_8);
name.add(value);
}

View File

@ -18,7 +18,6 @@ package org.apache.lucene.codecs.simpletext;
*/
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.index.FieldInfo;
@ -153,7 +152,9 @@ public class SimpleTextStoredFieldsReader extends StoredFieldsReader {
readLine();
assert StringHelper.startsWith(scratch.get(), VALUE);
if (type == TYPE_STRING) {
visitor.stringField(fieldInfo, new String(scratch.bytes(), VALUE.length, scratch.length()-VALUE.length, StandardCharsets.UTF_8));
byte[] bytes = new byte[scratch.length() - VALUE.length];
System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length);
visitor.stringField(fieldInfo, bytes);
} else if (type == TYPE_BINARY) {
byte[] copy = new byte[scratch.length()-VALUE.length];
System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length);

View File

@ -19,6 +19,7 @@ package org.apache.lucene.codecs;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.FieldInfo;
@ -140,14 +141,16 @@ public abstract class StoredFieldsWriter implements Closeable {
@Override
public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
reset(fieldInfo);
// TODO: can we avoid new BR here?
binaryValue = new BytesRef(value);
write();
}
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
reset(fieldInfo);
stringValue = value;
// TODO: can we avoid new String here?
stringValue = new String(value, StandardCharsets.UTF_8);
write();
}

View File

@ -41,7 +41,6 @@ import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter
import java.io.EOFException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
@ -220,7 +219,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
length = in.readVInt();
data = new byte[length];
in.readBytes(data, 0, length);
visitor.stringField(info, new String(data, StandardCharsets.UTF_8));
visitor.stringField(info, data);
break;
case NUMERIC_INT:
visitor.intField(info, in.readZInt());

View File

@ -18,8 +18,9 @@ package org.apache.lucene.document;
*/
import java.io.IOException;
import java.util.Set;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
@ -66,12 +67,12 @@ public class DocumentStoredFieldVisitor extends StoredFieldVisitor {
}
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
final FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(fieldInfo.hasVectors());
ft.setOmitNorms(fieldInfo.omitsNorms());
ft.setIndexOptions(fieldInfo.getIndexOptions());
doc.add(new StoredField(fieldInfo.name, value, ft));
doc.add(new StoredField(fieldInfo.name, new String(value, StandardCharsets.UTF_8), ft));
}
@Override

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.FieldType.NumericType;
import org.apache.lucene.index.FieldInvertState; // javadocs
import org.apache.lucene.index.IndexOptions;
@ -32,6 +33,7 @@ import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
/**
@ -215,9 +217,6 @@ public class Field implements IndexableField, StorableField {
if (bytes == null) {
throw new IllegalArgumentException("bytes cannot be null");
}
if (type.indexOptions() != IndexOptions.NONE) {
throw new IllegalArgumentException("Fields with BytesRef values cannot be indexed");
}
this.fieldsData = bytes;
this.type = type;
this.name = name;
@ -538,16 +537,25 @@ public class Field implements IndexableField, StorableField {
}
if (!fieldType().tokenized()) {
if (stringValue() == null) {
if (stringValue() != null) {
if (!(reuse instanceof StringTokenStream)) {
// lazy init the TokenStream as it is heavy to instantiate
// (attributes,...) if not needed
reuse = new StringTokenStream();
}
((StringTokenStream) reuse).setValue(stringValue());
return reuse;
} else if (binaryValue() != null) {
if (!(reuse instanceof BinaryTokenStream)) {
// lazy init the TokenStream as it is heavy to instantiate
// (attributes,...) if not needed
reuse = new BinaryTokenStream();
}
((BinaryTokenStream) reuse).setValue(binaryValue());
return reuse;
} else {
throw new IllegalArgumentException("Non-Tokenized Fields must have a String value");
}
if (!(reuse instanceof StringTokenStream)) {
// lazy init the TokenStream as it is heavy to instantiate
// (attributes,...) if not needed (stored field loading)
reuse = new StringTokenStream();
}
((StringTokenStream) reuse).setValue(stringValue());
return reuse;
}
if (tokenStream != null) {
@ -561,7 +569,69 @@ public class Field implements IndexableField, StorableField {
throw new IllegalArgumentException("Field must have either TokenStream, String, Reader or Number value; got " + this);
}
static final class StringTokenStream extends TokenStream {
private static final class BinaryTokenStream extends TokenStream {
private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
// Do not init this to true, becase caller must first call reset:
private boolean available;
public BinaryTokenStream() {
}
public void setValue(BytesRef value) {
bytesAtt.setBytesRef(value);
}
@Override
public boolean incrementToken() {
if (available) {
clearAttributes();
available = false;
return true;
}
return false;
}
@Override
public void reset() {
available = true;
}
public interface ByteTermAttribute extends TermToBytesRefAttribute {
public void setBytesRef(BytesRef bytes);
}
public static class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute {
private BytesRef bytes;
@Override
public void fillBytesRef() {
// no-op: the bytes was already filled by our owner's incrementToken
}
@Override
public BytesRef getBytesRef() {
return bytes;
}
@Override
public void setBytesRef(BytesRef bytes) {
this.bytes = bytes;
}
@Override
public void clear() {
}
@Override
public void copyTo(AttributeImpl target) {
ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
other.bytes = bytes;
}
}
}
private static final class StringTokenStream extends TokenStream {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private boolean used = false;

View File

@ -18,6 +18,7 @@ package org.apache.lucene.document;
*/
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.util.BytesRef;
/** A field that is indexed but not tokenized: the entire
* String value is indexed as a single token. For example
@ -48,7 +49,9 @@ public final class StringField extends Field {
TYPE_STORED.freeze();
}
/** Creates a new StringField.
/** Creates a new textual StringField, indexing the provided String value
* as a single token.
*
* @param name field name
* @param value String value
* @param stored Store.YES if the content should also be stored
@ -57,4 +60,18 @@ public final class StringField extends Field {
public StringField(String name, String value, Store stored) {
super(name, value, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
}
/** Creates a new binary StringField, indexing the provided binary (BytesRef)
* value as a single token.
*
* @param name field name
* @param value BytesRef value. The provided value is not cloned so
* you must not change it until the document(s) holding it
* have been indexed.
* @param stored Store.YES if the content should also be stored
* @throws IllegalArgumentException if the field name or value is null.
*/
public StringField(String name, BytesRef value, Store stored) {
super(name, value, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
}
}

View File

@ -53,8 +53,8 @@ public abstract class StoredFieldVisitor {
public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
}
/** Process a string field */
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
/** Process a string field; the provided byte[] value is a UTF-8 encoded string value. */
public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
}
/** Process a int numeric field. */
@ -93,4 +93,4 @@ public abstract class StoredFieldVisitor {
/** STOP: don't visit this field and stop processing any other fields for this document. */
STOP
}
}
}

View File

@ -22,6 +22,14 @@ import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@ -408,6 +416,28 @@ public class TestField extends LuceneTestCase {
assertEquals(5L, field.numericValue().longValue());
}
public void testIndexedBinaryField() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
BytesRef br = new BytesRef(new byte[5]);
Field field = new StringField("binary", br, Field.Store.YES);
assertEquals(br, field.binaryValue());
doc.add(field);
w.addDocument(doc);
IndexReader r = w.getReader();
IndexSearcher s = newSearcher(r);
TopDocs hits = s.search(new TermQuery(new Term("binary", br)), 1);
assertEquals(1, hits.totalHits);
StoredDocument storedDoc = s.doc(hits.scoreDocs[0].doc);
assertEquals(br, storedDoc.getField("binary").binaryValue());
r.close();
w.close();
dir.close();
}
private void trySetByteValue(Field f) {
try {

View File

@ -213,8 +213,9 @@ public class TestRollingUpdates extends LuceneTestCase {
DirectoryReader open = null;
for (int i = 0; i < num; i++) {
Document doc = new Document();// docs.nextDoc();
doc.add(newStringField("id", "test", Field.Store.NO));
writer.updateDocument(new Term("id", "test"), doc);
BytesRef br = new BytesRef("test");
doc.add(newStringField("id", br, Field.Store.NO));
writer.updateDocument(new Term("id", br), doc);
if (random().nextInt(3) == 0) {
if (open == null) {
open = DirectoryReader.open(writer, true);

View File

@ -31,13 +31,13 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
@ -108,7 +108,7 @@ public class TestLiveFieldValues extends LuceneTestCase {
if (threadRandom.nextDouble() <= addChance) {
String id = String.format(Locale.ROOT, "%d_%04x", threadID, threadRandom.nextInt(idCount));
Integer field = threadRandom.nextInt(Integer.MAX_VALUE);
doc.add(new StringField("id", id, Field.Store.YES));
doc.add(newStringField("id", new BytesRef(id), Field.Store.YES));
doc.add(new IntField("field", field.intValue(), Field.Store.YES));
w.updateDocument(new Term("id", id), doc);
rt.add(id, field);
@ -119,7 +119,7 @@ public class TestLiveFieldValues extends LuceneTestCase {
if (allIDs.size() > 0 && threadRandom.nextDouble() <= deleteChance) {
String randomID = allIDs.get(threadRandom.nextInt(allIDs.size()));
w.deleteDocuments(new Term("id", randomID));
w.deleteDocuments(new Term("id", new BytesRef(randomID)));
rt.delete(randomID);
values.put(randomID, missing);
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.search.postingshighlight;
*/
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
@ -792,7 +793,8 @@ public class PostingsHighlighter {
}
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
String value = new String(bytes, StandardCharsets.UTF_8);
assert currentField >= 0;
StringBuilder builder = builders[currentField];
if (builder.length() > 0 && builder.length() < maxLength) {

View File

@ -17,6 +17,16 @@ package org.apache.lucene.search.vectorhighlight;
* limitations under the License.
*/
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
@ -25,19 +35,10 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
* Base FragmentsBuilder implementation that supports colored pre/post
* tags and multivalued fields.
@ -152,7 +153,8 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
reader.document(docId, new StoredFieldVisitor() {
@Override
public void stringField(FieldInfo fieldInfo, String value) {
public void stringField(FieldInfo fieldInfo, byte[] bytes) {
String value = new String(bytes, StandardCharsets.UTF_8);
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(fieldInfo.hasVectors());
fields.add(new Field(fieldInfo.name, value, ft));

View File

@ -16,19 +16,19 @@
*/
package org.apache.lucene.document;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Set;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.HashMap;
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.store.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.After;
import org.junit.Before;
@ -209,7 +209,8 @@ public class TestLazyDocument extends LuceneTestCase {
}
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
String value = new String(bytes, StandardCharsets.UTF_8);
final FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(fieldInfo.hasVectors());
ft.setOmitNorms(fieldInfo.omitsNorms());

View File

@ -78,7 +78,7 @@ public final class FieldFilterLeafReader extends FilterLeafReader {
}
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
visitor.stringField(fieldInfo, value);
}

View File

@ -92,7 +92,7 @@ public class MismatchedLeafReader extends FilterLeafReader {
}
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
in.stringField(renumber(fieldInfo), value);
}

View File

@ -56,49 +56,24 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.logging.Logger;
import com.carrotsearch.randomizedtesting.JUnit4MethodProvider;
import com.carrotsearch.randomizedtesting.LifecycleScope;
import com.carrotsearch.randomizedtesting.MixWithSuiteName;
import com.carrotsearch.randomizedtesting.RandomizedContext;
import com.carrotsearch.randomizedtesting.RandomizedRunner;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import com.carrotsearch.randomizedtesting.annotations.Listeners;
import com.carrotsearch.randomizedtesting.annotations.SeedDecorators;
import com.carrotsearch.randomizedtesting.annotations.TestGroup;
import com.carrotsearch.randomizedtesting.annotations.TestMethodProviders;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction.Action;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakGroup;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakGroup.Group;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies.Consequence;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import com.carrotsearch.randomizedtesting.rules.NoClassHooksShadowingRule;
import com.carrotsearch.randomizedtesting.rules.NoInstanceHooksOverridesRule;
import com.carrotsearch.randomizedtesting.rules.StaticFieldsInvariantRule;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexReader.ReaderClosedListener;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.search.AssertingIndexSearcher;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LRUQueryCache;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryCache;
import org.apache.lucene.search.QueryCachingPolicy;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.QueryUtils.FCInvisibleMultiReader;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.store.Directory;
@ -108,8 +83,8 @@ import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.LockFactory;
import org.apache.lucene.store.MergeInfo;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.MockDirectoryWrapper.Throttling;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.NRTCachingDirectory;
import org.apache.lucene.store.RawDirectoryWrapper;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
@ -126,6 +101,31 @@ import org.junit.Test;
import org.junit.rules.RuleChain;
import org.junit.rules.TestRule;
import org.junit.runner.RunWith;
import com.carrotsearch.randomizedtesting.JUnit4MethodProvider;
import com.carrotsearch.randomizedtesting.LifecycleScope;
import com.carrotsearch.randomizedtesting.MixWithSuiteName;
import com.carrotsearch.randomizedtesting.RandomizedContext;
import com.carrotsearch.randomizedtesting.RandomizedRunner;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import com.carrotsearch.randomizedtesting.annotations.Listeners;
import com.carrotsearch.randomizedtesting.annotations.SeedDecorators;
import com.carrotsearch.randomizedtesting.annotations.TestGroup;
import com.carrotsearch.randomizedtesting.annotations.TestMethodProviders;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction.Action;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakGroup.Group;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakGroup;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies.Consequence;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import com.carrotsearch.randomizedtesting.rules.NoClassHooksShadowingRule;
import com.carrotsearch.randomizedtesting.rules.NoInstanceHooksOverridesRule;
import com.carrotsearch.randomizedtesting.rules.StaticFieldsInvariantRule;
import static com.carrotsearch.randomizedtesting.RandomizedTest.systemPropertyAsBoolean;
import static com.carrotsearch.randomizedtesting.RandomizedTest.systemPropertyAsInt;
@ -1336,6 +1336,10 @@ public abstract class LuceneTestCase extends Assert {
return newField(random(), name, value, stored == Store.YES ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
}
public static Field newStringField(String name, BytesRef value, Store stored) {
return newField(random(), name, value, stored == Store.YES ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
}
public static Field newTextField(String name, String value, Store stored) {
return newField(random(), name, value, stored == Store.YES ? TextField.TYPE_STORED : TextField.TYPE_NOT_STORED);
}
@ -1343,6 +1347,10 @@ public abstract class LuceneTestCase extends Assert {
public static Field newStringField(Random random, String name, String value, Store stored) {
return newField(random, name, value, stored == Store.YES ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
}
public static Field newStringField(Random random, String name, BytesRef value, Store stored) {
return newField(random, name, value, stored == Store.YES ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
}
public static Field newTextField(Random random, String name, String value, Store stored) {
return newField(random, name, value, stored == Store.YES ? TextField.TYPE_STORED : TextField.TYPE_NOT_STORED);
@ -1372,7 +1380,7 @@ public abstract class LuceneTestCase extends Assert {
// write-once schema sort of helper class then we can
// remove the sync here. We can also fold the random
// "enable norms" (now commented out, below) into that:
public synchronized static Field newField(Random random, String name, String value, FieldType type) {
public synchronized static Field newField(Random random, String name, Object value, FieldType type) {
// Defeat any consumers that illegally rely on intern'd
// strings (we removed this from Lucene a while back):
@ -1388,7 +1396,7 @@ public abstract class LuceneTestCase extends Assert {
type = mergeTermVectorOptions(type, prevType);
}
return new Field(name, value, type);
return createField(name, value, type);
}
// TODO: once all core & test codecs can index
@ -1434,7 +1442,17 @@ public abstract class LuceneTestCase extends Assert {
}
*/
return new Field(name, value, newType);
return createField(name, value, newType);
}
private static Field createField(String name, Object value, FieldType fieldType) {
if (value instanceof String) {
return new Field(name, (String) value, fieldType);
} else if (value instanceof BytesRef) {
return new Field(name, (BytesRef) value, fieldType);
} else {
throw new IllegalArgumentException("value must be String or BytesRef");
}
}
/**

View File

@ -1,6 +1,7 @@
package org.apache.solr.handler.component;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@ -275,8 +276,8 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
// once we find it...
final StoredFieldVisitor getUniqValue = new StoredFieldVisitor() {
@Override
public void stringField(FieldInfo fieldInfo, String value) {
uniqValues.add(value);
public void stringField(FieldInfo fieldInfo, byte[] bytes) {
uniqValues.add(new String(bytes, StandardCharsets.UTF_8));
}
@Override

View File

@ -20,6 +20,7 @@ package org.apache.solr.search;
import java.io.Closeable;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@ -80,10 +81,10 @@ import org.apache.solr.core.SolrInfoMBean;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestInfo;
import org.apache.solr.search.facet.UnInvertedField;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.facet.UnInvertedField;
import org.apache.solr.search.stats.StatsSource;
import org.apache.solr.update.SolrIndexConfig;
import org.slf4j.Logger;
@ -618,7 +619,8 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable,SolrIn
}
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
String value = new String(bytes, StandardCharsets.UTF_8);
final FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(fieldInfo.hasVectors());
ft.setOmitNorms(fieldInfo.omitsNorms());
@ -708,7 +710,7 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable,SolrIn
throw new AssertionError();
}
} else {
visitor.stringField(info, f.stringValue());
visitor.stringField(info, f.stringValue().getBytes(StandardCharsets.UTF_8));
}
break;
case NO: