LUCENE-5989: allow passing BytesRef to StringField to make it easier to index arbitrary binary tokens

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1672781 13f79535-47bb-0310-9956-ffa450edef68
2015-04-10 22:24:46 +00:00 · 2015-04-10 22:24:46 +00:00 · 249d0d25fe
parent 09ddb7d912
commit 249d0d25fe
20 changed files with 242 additions and 89 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -58,6 +58,9 @@ New Features
  accuracy of SDV. Includes optimized Intersect predicate to avoid many
  geometry checks. Uses TwoPhaseIterator. (David Smiley)

+* LUCENE-5989: Allow passing BytesRef to StringField to make it easier
+  to index arbitrary binary tokens (Mike McCandless)
+
 Optimizations

 * LUCENE-6379: IndexWriter.deleteDocuments(Query...) now detects if
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java
@ -17,6 +17,7 @@
 package org.apache.lucene.benchmark.quality.utils;

 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;

@ -50,7 +51,8 @@ public class DocNameExtractor {
    final List<String> name = new ArrayList<>();
    searcher.getIndexReader().document(docid, new StoredFieldVisitor() {
        @Override
-        public void stringField(FieldInfo fieldInfo, String value) {
+        public void stringField(FieldInfo fieldInfo, byte[] bytes) {
+          String value = new String(bytes, StandardCharsets.UTF_8);
          name.add(value);
        }

--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java
@ -18,7 +18,6 @@ package org.apache.lucene.codecs.simpletext;
 */

 import java.io.IOException;
-import java.nio.charset.StandardCharsets;

 import org.apache.lucene.codecs.StoredFieldsReader;
 import org.apache.lucene.index.FieldInfo;
@ -153,7 +152,9 @@ public class SimpleTextStoredFieldsReader extends StoredFieldsReader {
    readLine();
    assert StringHelper.startsWith(scratch.get(), VALUE);
    if (type == TYPE_STRING) {
-      visitor.stringField(fieldInfo, new String(scratch.bytes(), VALUE.length, scratch.length()-VALUE.length, StandardCharsets.UTF_8));
+      byte[] bytes = new byte[scratch.length() - VALUE.length];
+      System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length);
+      visitor.stringField(fieldInfo, bytes);
    } else if (type == TYPE_BINARY) {
      byte[] copy = new byte[scratch.length()-VALUE.length];
      System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length);
--- a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java
@ -19,6 +19,7 @@ package org.apache.lucene.codecs;
 import java.io.Closeable;
 import java.io.IOException;
 import java.io.Reader;
+import java.nio.charset.StandardCharsets;

 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.index.FieldInfo;
@ -140,14 +141,16 @@ public abstract class StoredFieldsWriter implements Closeable {
    @Override
    public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
      reset(fieldInfo);
+      // TODO: can we avoid new BR here?
      binaryValue = new BytesRef(value);
      write();
    }

    @Override
-    public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+    public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
      reset(fieldInfo);
-      stringValue = value;
+      // TODO: can we avoid new String here?
+      stringValue = new String(value, StandardCharsets.UTF_8);
      write();
    }

--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
@ -41,7 +41,6 @@ import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter

 import java.io.EOFException;
 import java.io.IOException;
-import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
@ -220,7 +219,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
        length = in.readVInt();
        data = new byte[length];
        in.readBytes(data, 0, length);
-        visitor.stringField(info, new String(data, StandardCharsets.UTF_8));
+        visitor.stringField(info, data);
        break;
      case NUMERIC_INT:
        visitor.intField(info, in.readZInt());
--- a/lucene/core/src/java/org/apache/lucene/document/DocumentStoredFieldVisitor.java
+++ b/lucene/core/src/java/org/apache/lucene/document/DocumentStoredFieldVisitor.java
@ -18,8 +18,9 @@ package org.apache.lucene.document;
 */

 import java.io.IOException;
-import java.util.Set;
+import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
+import java.util.Set;

 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexReader;
@ -66,12 +67,12 @@ public class DocumentStoredFieldVisitor extends StoredFieldVisitor {
  }

  @Override
-  public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+  public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
    final FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(fieldInfo.hasVectors());
    ft.setOmitNorms(fieldInfo.omitsNorms());
    ft.setIndexOptions(fieldInfo.getIndexOptions());
-    doc.add(new StoredField(fieldInfo.name, value, ft));
+    doc.add(new StoredField(fieldInfo.name, new String(value, StandardCharsets.UTF_8), ft));
  }

  @Override
--- a/lucene/core/src/java/org/apache/lucene/document/Field.java
+++ b/lucene/core/src/java/org/apache/lucene/document/Field.java
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.NumericTokenStream;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.document.FieldType.NumericType;
 import org.apache.lucene.index.FieldInvertState; // javadocs
 import org.apache.lucene.index.IndexOptions;
@ -32,6 +33,7 @@ import org.apache.lucene.index.IndexWriter; // javadocs
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.IndexableFieldType;
 import org.apache.lucene.index.StorableField;
+import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.BytesRef;

 /**
@ -215,9 +217,6 @@ public class Field implements IndexableField, StorableField {
    if (bytes == null) {
      throw new IllegalArgumentException("bytes cannot be null");
    }
-    if (type.indexOptions() != IndexOptions.NONE) {
-      throw new IllegalArgumentException("Fields with BytesRef values cannot be indexed");
-    }
    this.fieldsData = bytes;
    this.type = type;
    this.name = name;
@ -538,16 +537,25 @@ public class Field implements IndexableField, StorableField {
    }

    if (!fieldType().tokenized()) {
-      if (stringValue() == null) {
+      if (stringValue() != null) {
+        if (!(reuse instanceof StringTokenStream)) {
+          // lazy init the TokenStream as it is heavy to instantiate
+          // (attributes,...) if not needed
+          reuse = new StringTokenStream();
+        }
+        ((StringTokenStream) reuse).setValue(stringValue());
+        return reuse;
+      } else if (binaryValue() != null) {
+        if (!(reuse instanceof BinaryTokenStream)) {
+          // lazy init the TokenStream as it is heavy to instantiate
+          // (attributes,...) if not needed
+          reuse = new BinaryTokenStream();
+        }
+        ((BinaryTokenStream) reuse).setValue(binaryValue());
+        return reuse;
+      } else {
        throw new IllegalArgumentException("Non-Tokenized Fields must have a String value");
      }
-      if (!(reuse instanceof StringTokenStream)) {
-        // lazy init the TokenStream as it is heavy to instantiate
-        // (attributes,...) if not needed (stored field loading)
-        reuse = new StringTokenStream();
-      }
-      ((StringTokenStream) reuse).setValue(stringValue());
-      return reuse;
    }

    if (tokenStream != null) {
@ -561,7 +569,69 @@ public class Field implements IndexableField, StorableField {
    throw new IllegalArgumentException("Field must have either TokenStream, String, Reader or Number value; got " + this);
  }
  
-  static final class StringTokenStream extends TokenStream {
+  private static final class BinaryTokenStream extends TokenStream {
+    private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
+
+    // Do not init this to true, becase caller must first call reset:
+    private boolean available;
+  
+    public BinaryTokenStream() {
+    }
+
+    public void setValue(BytesRef value) {
+      bytesAtt.setBytesRef(value);
+    }
+  
+    @Override
+    public boolean incrementToken() {
+      if (available) {
+        clearAttributes();
+        available = false;
+        return true;
+      }
+      return false;
+    }
+  
+    @Override
+    public void reset() {
+      available = true;
+    }
+  
+    public interface ByteTermAttribute extends TermToBytesRefAttribute {
+      public void setBytesRef(BytesRef bytes);
+    }
+  
+    public static class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute {
+      private BytesRef bytes;
+    
+      @Override
+      public void fillBytesRef() {
+        // no-op: the bytes was already filled by our owner's incrementToken
+      }
+    
+      @Override
+      public BytesRef getBytesRef() {
+        return bytes;
+      }
+
+      @Override
+      public void setBytesRef(BytesRef bytes) {
+        this.bytes = bytes;
+      }
+    
+      @Override
+      public void clear() {
+      }
+    
+      @Override
+      public void copyTo(AttributeImpl target) {
+        ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
+        other.bytes = bytes;
+      }
+    }
+  }
+
+  private static final class StringTokenStream extends TokenStream {
    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
    private boolean used = false;
--- a/lucene/core/src/java/org/apache/lucene/document/StringField.java
+++ b/lucene/core/src/java/org/apache/lucene/document/StringField.java
@ -18,6 +18,7 @@ package org.apache.lucene.document;
 */

 import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.util.BytesRef;

 /** A field that is indexed but not tokenized: the entire
 *  String value is indexed as a single token.  For example
@ -48,7 +49,9 @@ public final class StringField extends Field {
    TYPE_STORED.freeze();
  }

-  /** Creates a new StringField. 
+  /** Creates a new textual StringField, indexing the provided String value
+   *  as a single token.
+   *
   *  @param name field name
   *  @param value String value
   *  @param stored Store.YES if the content should also be stored
@ -57,4 +60,18 @@ public final class StringField extends Field {
  public StringField(String name, String value, Store stored) {
    super(name, value, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
  }
+
+  /** Creates a new binary StringField, indexing the provided binary (BytesRef)
+   *  value as a single token.
+   *
+   *  @param name field name
+   *  @param value BytesRef value.  The provided value is not cloned so
+   *         you must not change it until the document(s) holding it
+   *         have been indexed.
+   *  @param stored Store.YES if the content should also be stored
+   *  @throws IllegalArgumentException if the field name or value is null.
+   */
+  public StringField(String name, BytesRef value, Store stored) {
+    super(name, value, stored == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
+  }
 }
--- a/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java
+++ b/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java
@ -53,8 +53,8 @@ public abstract class StoredFieldVisitor {
  public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
  }

-  /** Process a string field */
-  public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+  /** Process a string field; the provided byte[] value is a UTF-8 encoded string value. */
+  public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
  }

  /** Process a int numeric field. */
--- a/lucene/core/src/test/org/apache/lucene/document/TestField.java
+++ b/lucene/core/src/test/org/apache/lucene/document/TestField.java
@ -22,6 +22,14 @@ import java.nio.charset.StandardCharsets;

 import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;

@ -409,6 +417,28 @@ public class TestField extends LuceneTestCase {
    assertEquals(5L, field.numericValue().longValue());
  }

+  public void testIndexedBinaryField() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    BytesRef br = new BytesRef(new byte[5]);
+    Field field = new StringField("binary", br, Field.Store.YES);
+    assertEquals(br, field.binaryValue());
+    doc.add(field);
+    w.addDocument(doc);
+    IndexReader r = w.getReader();
+
+    IndexSearcher s = newSearcher(r);
+    TopDocs hits = s.search(new TermQuery(new Term("binary", br)), 1);
+    assertEquals(1, hits.totalHits);
+    StoredDocument storedDoc = s.doc(hits.scoreDocs[0].doc);
+    assertEquals(br, storedDoc.getField("binary").binaryValue());
+
+    r.close();
+    w.close();
+    dir.close();
+  }
+  
  private void trySetByteValue(Field f) {
    try {
      f.setByteValue((byte) 10);
--- a/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java
@ -213,8 +213,9 @@ public class TestRollingUpdates extends LuceneTestCase {
        DirectoryReader open = null;
        for (int i = 0; i < num; i++) {
          Document doc = new Document();// docs.nextDoc();
-          doc.add(newStringField("id", "test", Field.Store.NO));
-          writer.updateDocument(new Term("id", "test"), doc);
+          BytesRef br = new BytesRef("test");
+          doc.add(newStringField("id", br, Field.Store.NO));
+          writer.updateDocument(new Term("id", br), doc);
          if (random().nextInt(3) == 0) {
            if (open == null) {
              open = DirectoryReader.open(writer, true);
--- a/lucene/core/src/test/org/apache/lucene/search/TestLiveFieldValues.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestLiveFieldValues.java
@ -31,13 +31,13 @@ import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.IntField;
-import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.StoredDocument;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;

@ -108,7 +108,7 @@ public class TestLiveFieldValues extends LuceneTestCase {
                if (threadRandom.nextDouble() <= addChance) {
                  String id = String.format(Locale.ROOT, "%d_%04x", threadID, threadRandom.nextInt(idCount));
                  Integer field = threadRandom.nextInt(Integer.MAX_VALUE);
-                  doc.add(new StringField("id", id, Field.Store.YES));
+                  doc.add(newStringField("id", new BytesRef(id), Field.Store.YES));
                  doc.add(new IntField("field", field.intValue(), Field.Store.YES));
                  w.updateDocument(new Term("id", id), doc);
                  rt.add(id, field);
@ -119,7 +119,7 @@ public class TestLiveFieldValues extends LuceneTestCase {

                if (allIDs.size() > 0 && threadRandom.nextDouble() <= deleteChance) {
                  String randomID = allIDs.get(threadRandom.nextInt(allIDs.size()));
-                  w.deleteDocuments(new Term("id", randomID));
+                  w.deleteDocuments(new Term("id", new BytesRef(randomID)));
                  rt.delete(randomID);
                  values.put(randomID, missing);
                }
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
@ -18,6 +18,7 @@ package org.apache.lucene.search.postingshighlight;
 */

 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.text.BreakIterator;
 import java.util.ArrayList;
 import java.util.Arrays;
@ -792,7 +793,8 @@ public class PostingsHighlighter {
    }
    
    @Override
-    public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+    public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
+      String value = new String(bytes, StandardCharsets.UTF_8);
      assert currentField >= 0;
      StringBuilder builder = builders[currentField];
      if (builder.length() > 0 && builder.length() < maxLength) {
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
@ -17,6 +17,16 @@ package org.apache.lucene.search.vectorhighlight;
 * limitations under the License.
 */

+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.TextField;
@ -25,19 +35,10 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.search.highlight.DefaultEncoder;
 import org.apache.lucene.search.highlight.Encoder;
-import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
 import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
 import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;

-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
 /**
 * Base FragmentsBuilder implementation that supports colored pre/post
 * tags and multivalued fields.
@ -152,7 +153,8 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
    reader.document(docId, new StoredFieldVisitor() {
        
        @Override
-        public void stringField(FieldInfo fieldInfo, String value) {
+        public void stringField(FieldInfo fieldInfo, byte[] bytes) {
+          String value = new String(bytes, StandardCharsets.UTF_8);
          FieldType ft = new FieldType(TextField.TYPE_STORED);
          ft.setStoreTermVectors(fieldInfo.hasVectors());
          fields.add(new Field(fieldInfo.name, value, ft));
--- a/lucene/misc/src/test/org/apache/lucene/document/TestLazyDocument.java
+++ b/lucene/misc/src/test/org/apache/lucene/document/TestLazyDocument.java
@ -16,19 +16,19 @@
 */
 package org.apache.lucene.document;

+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
-import java.util.Set;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
-import java.util.HashMap;
-import java.io.IOException;
+import java.util.Set;

-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.store.*;
 import org.apache.lucene.analysis.*;
 import org.apache.lucene.index.*;
 import org.apache.lucene.search.*;
-
+import org.apache.lucene.store.*;
+import org.apache.lucene.util.LuceneTestCase;
 import org.junit.After;
 import org.junit.Before;

@ -209,7 +209,8 @@ public class TestLazyDocument extends LuceneTestCase {
    }

    @Override
-    public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+    public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
+      String value = new String(bytes, StandardCharsets.UTF_8);
      final FieldType ft = new FieldType(TextField.TYPE_STORED);
      ft.setStoreTermVectors(fieldInfo.hasVectors());
      ft.setOmitNorms(fieldInfo.omitsNorms());
--- a/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterLeafReader.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterLeafReader.java
@ -78,7 +78,7 @@ public final class FieldFilterLeafReader extends FilterLeafReader {
      }

      @Override
-      public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+      public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
        visitor.stringField(fieldInfo, value);
      }

--- a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java
@ -92,7 +92,7 @@ public class MismatchedLeafReader extends FilterLeafReader {
    }

    @Override
-    public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+    public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
      in.stringField(renumber(fieldInfo), value);
    }

--- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
@ -56,49 +56,24 @@ import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.logging.Logger;

-import com.carrotsearch.randomizedtesting.JUnit4MethodProvider;
-import com.carrotsearch.randomizedtesting.LifecycleScope;
-import com.carrotsearch.randomizedtesting.MixWithSuiteName;
-import com.carrotsearch.randomizedtesting.RandomizedContext;
-import com.carrotsearch.randomizedtesting.RandomizedRunner;
-import com.carrotsearch.randomizedtesting.RandomizedTest;
-import com.carrotsearch.randomizedtesting.annotations.Listeners;
-import com.carrotsearch.randomizedtesting.annotations.SeedDecorators;
-import com.carrotsearch.randomizedtesting.annotations.TestGroup;
-import com.carrotsearch.randomizedtesting.annotations.TestMethodProviders;
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction;
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction.Action;
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakGroup;
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakGroup.Group;
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope;
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies;
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies.Consequence;
-import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
-import com.carrotsearch.randomizedtesting.generators.RandomPicks;
-import com.carrotsearch.randomizedtesting.rules.NoClassHooksShadowingRule;
-import com.carrotsearch.randomizedtesting.rules.NoInstanceHooksOverridesRule;
-import com.carrotsearch.randomizedtesting.rules.StaticFieldsInvariantRule;
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.*;
 import org.apache.lucene.index.IndexReader.ReaderClosedListener;
+import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.TermsEnum.SeekStatus;
 import org.apache.lucene.search.AssertingIndexSearcher;
 import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.LRUQueryCache;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryCache;
 import org.apache.lucene.search.QueryCachingPolicy;
-import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.QueryUtils.FCInvisibleMultiReader;
 import org.apache.lucene.store.BaseDirectoryWrapper;
 import org.apache.lucene.store.Directory;
@ -108,8 +83,8 @@ import org.apache.lucene.store.FlushInfo;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.LockFactory;
 import org.apache.lucene.store.MergeInfo;
-import org.apache.lucene.store.MockDirectoryWrapper;
 import org.apache.lucene.store.MockDirectoryWrapper.Throttling;
+import org.apache.lucene.store.MockDirectoryWrapper;
 import org.apache.lucene.store.NRTCachingDirectory;
 import org.apache.lucene.store.RawDirectoryWrapper;
 import org.apache.lucene.util.automaton.AutomatonTestUtil;
@ -126,6 +101,31 @@ import org.junit.Test;
 import org.junit.rules.RuleChain;
 import org.junit.rules.TestRule;
 import org.junit.runner.RunWith;
+import com.carrotsearch.randomizedtesting.JUnit4MethodProvider;
+import com.carrotsearch.randomizedtesting.LifecycleScope;
+import com.carrotsearch.randomizedtesting.MixWithSuiteName;
+import com.carrotsearch.randomizedtesting.RandomizedContext;
+import com.carrotsearch.randomizedtesting.RandomizedRunner;
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import com.carrotsearch.randomizedtesting.annotations.Listeners;
+import com.carrotsearch.randomizedtesting.annotations.SeedDecorators;
+import com.carrotsearch.randomizedtesting.annotations.TestGroup;
+import com.carrotsearch.randomizedtesting.annotations.TestMethodProviders;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction.Action;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakGroup.Group;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakGroup;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies.Consequence;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies;
+import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+import com.carrotsearch.randomizedtesting.rules.NoClassHooksShadowingRule;
+import com.carrotsearch.randomizedtesting.rules.NoInstanceHooksOverridesRule;
+import com.carrotsearch.randomizedtesting.rules.StaticFieldsInvariantRule;

 import static com.carrotsearch.randomizedtesting.RandomizedTest.systemPropertyAsBoolean;
 import static com.carrotsearch.randomizedtesting.RandomizedTest.systemPropertyAsInt;
@ -1336,6 +1336,10 @@ public abstract class LuceneTestCase extends Assert {
    return newField(random(), name, value, stored == Store.YES ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
  }

+  public static Field newStringField(String name, BytesRef value, Store stored) {
+    return newField(random(), name, value, stored == Store.YES ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
+  }
+
  public static Field newTextField(String name, String value, Store stored) {
    return newField(random(), name, value, stored == Store.YES ? TextField.TYPE_STORED : TextField.TYPE_NOT_STORED);
  }
@ -1344,6 +1348,10 @@ public abstract class LuceneTestCase extends Assert {
    return newField(random, name, value, stored == Store.YES ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
  }

+  public static Field newStringField(Random random, String name, BytesRef value, Store stored) {
+    return newField(random, name, value, stored == Store.YES ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
+  }
+  
  public static Field newTextField(Random random, String name, String value, Store stored) {
    return newField(random, name, value, stored == Store.YES ? TextField.TYPE_STORED : TextField.TYPE_NOT_STORED);
  }
@ -1372,7 +1380,7 @@ public abstract class LuceneTestCase extends Assert {
  // write-once schema sort of helper class then we can
  // remove the sync here.  We can also fold the random
  // "enable norms" (now commented out, below) into that:
-  public synchronized static Field newField(Random random, String name, String value, FieldType type) {
+  public synchronized static Field newField(Random random, String name, Object value, FieldType type) {

    // Defeat any consumers that illegally rely on intern'd
    // strings (we removed this from Lucene a while back):
@ -1388,7 +1396,7 @@ public abstract class LuceneTestCase extends Assert {
        type = mergeTermVectorOptions(type, prevType);
      }

-      return new Field(name, value, type);
+      return createField(name, value, type);
    }

    // TODO: once all core & test codecs can index
@ -1434,7 +1442,17 @@ public abstract class LuceneTestCase extends Assert {
    }
    */
    
-    return new Field(name, value, newType);
+    return createField(name, value, newType);
+  }
+
+  private static Field createField(String name, Object value, FieldType fieldType) {
+    if (value instanceof String) {
+      return new Field(name, (String) value, fieldType);
+    } else if (value instanceof BytesRef) {
+      return new Field(name, (BytesRef) value, fieldType);
+    } else {
+      throw new IllegalArgumentException("value must be String or BytesRef");
+    }
  }

  /** 
--- a/solr/core/src/java/org/apache/solr/handler/component/TermVectorComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/TermVectorComponent.java
@ -1,6 +1,7 @@
 package org.apache.solr.handler.component;

 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@ -275,8 +276,8 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
    // once we find it...
    final StoredFieldVisitor getUniqValue = new StoredFieldVisitor() {
      @Override 
-      public void stringField(FieldInfo fieldInfo, String value) {
-        uniqValues.add(value);
+      public void stringField(FieldInfo fieldInfo, byte[] bytes) {
+        uniqValues.add(new String(bytes, StandardCharsets.UTF_8));
      }

      @Override 
--- a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java
+++ b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java
@ -20,6 +20,7 @@ package org.apache.solr.search;
 import java.io.Closeable;
 import java.io.IOException;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@ -80,10 +81,10 @@ import org.apache.solr.core.SolrInfoMBean;
 import org.apache.solr.request.LocalSolrQueryRequest;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.SolrRequestInfo;
-import org.apache.solr.search.facet.UnInvertedField;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.facet.UnInvertedField;
 import org.apache.solr.search.stats.StatsSource;
 import org.apache.solr.update.SolrIndexConfig;
 import org.slf4j.Logger;
@ -618,7 +619,8 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable,SolrIn
    }

    @Override
-    public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+    public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
+      String value = new String(bytes, StandardCharsets.UTF_8);
      final FieldType ft = new FieldType(TextField.TYPE_STORED);
      ft.setStoreTermVectors(fieldInfo.hasVectors());
      ft.setOmitNorms(fieldInfo.omitsNorms());
@ -708,7 +710,7 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable,SolrIn
              throw new AssertionError();
            }
          } else {
-            visitor.stringField(info, f.stringValue());
+            visitor.stringField(info, f.stringValue().getBytes(StandardCharsets.UTF_8));
          }
          break;
        case NO: