diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java index 6c36e277ad1..08d582f45c0 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java @@ -302,6 +302,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer { IntFunction values = new IntFunction() { final BytesRefBuilder term = new BytesRefBuilder(); + final BytesRefBuilder termByteArray = new BytesRefBuilder(); @Override public BytesRef apply(int docID) { @@ -329,9 +330,10 @@ class SimpleTextDocValuesReader extends DocValuesProducer { } catch (ParseException pe) { throw new CorruptIndexException("failed to parse int length", in, pe); } - term.grow(len); - term.setLength(len); - in.readBytes(term.bytes(), 0, len); + termByteArray.grow(len); + termByteArray.setLength(len); + in.readBytes(termByteArray.bytes(), 0, len); + term.copyBytes(SimpleTextUtil.fromBytesRefString(termByteArray.get().utf8ToString())); return term.get(); } catch (IOException ioe) { throw new RuntimeException(ioe); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java index bd0c307acc6..a49041d2730 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java @@ -169,7 +169,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { int maxLength = 0; BinaryDocValues values = valuesProducer.getBinary(field); for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { - maxLength = Math.max(maxLength, values.binaryValue().length); + maxLength = Math.max(maxLength, values.binaryValue().toString().length()); } writeFieldEntry(field, DocValuesType.BINARY); @@ -197,17 +197,16 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { values.nextDoc(); assert values.docID() >= i; } + String stringVal = values.docID() == i ? values.binaryValue().toString() : null; // write length - final int length = values.docID() != i ? 0 : values.binaryValue().length; + final int length = stringVal == null ? 0 : stringVal.length(); SimpleTextUtil.write(data, LENGTH); SimpleTextUtil.write(data, encoder.format(length), scratch); SimpleTextUtil.writeNewline(data); - // write bytes -- don't use SimpleText.write - // because it escapes: - if (values.docID() == i) { - BytesRef value = values.binaryValue(); - data.writeBytes(value.bytes, value.offset, value.length); + // write bytes as hex array + if (stringVal != null) { + SimpleTextUtil.write(data, stringVal, scratch); } // pad to fit @@ -215,7 +214,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { data.writeByte((byte) ' '); } SimpleTextUtil.writeNewline(data); - if (values.docID() != i) { + if (stringVal == null) { SimpleTextUtil.write(data, "F", scratch); } else { SimpleTextUtil.write(data, "T", scratch); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextUtil.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextUtil.java index 011e7560076..a18bb3f3727 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextUtil.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextUtil.java @@ -111,6 +111,9 @@ class SimpleTextUtil { throw new IllegalArgumentException( "string " + s + " was not created from BytesRef.toString?"); } + if (s.length() == 2) { + return new BytesRef(BytesRef.EMPTY_BYTES); + } String[] parts = s.substring(1, s.length() - 1).split(" "); byte[] bytes = new byte[parts.length]; for (int i = 0; i < parts.length; i++) { diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextDocValuesFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextDocValuesFormat.java index ed13e2bbad8..5c1504679ec 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextDocValuesFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextDocValuesFormat.java @@ -16,8 +16,22 @@ */ package org.apache.lucene.codecs.simpletext; +import java.io.IOException; +import java.util.List; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.BaseDocValuesFormatTestCase; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BytesRef; /** Tests SimpleTextDocValuesFormat */ public class TestSimpleTextDocValuesFormat extends BaseDocValuesFormatTestCase { @@ -27,4 +41,37 @@ public class TestSimpleTextDocValuesFormat extends BaseDocValuesFormatTestCase { protected Codec getCodec() { return codec; } + + public void testFileIsUTF8() throws IOException { + try (Directory dir = newDirectory()) { + IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); + try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf)) { + for (int i = 0; i < 100; i++) { + writer.addDocument( + List.of( + new SortedDocValuesField( + "sortedVal", newBytesRef(TestUtil.randomSimpleString(random()))), + new SortedSetDocValuesField( + "sortedSetVal", newBytesRef(TestUtil.randomSimpleString(random()))), + new NumericDocValuesField("numberVal", random().nextLong()), + new BinaryDocValuesField("binaryVal", TestUtil.randomBinaryTerm(random())))); + } + } + for (String file : dir.listAll()) { + if (file.endsWith("dat")) { + try (IndexInput input = dir.openChecksumInput(file)) { + long length = input.length(); + if (length > 20_000) { + // Avoid allocating a huge array if the length is wrong + fail("Doc values should not be this large"); + } + byte[] bytes = new byte[(int) length]; + input.readBytes(bytes, 0, bytes.length); + BytesRef bytesRef = new BytesRef(bytes); + assertNotEquals(bytesRef.toString(), Term.toString(bytesRef)); + } + } + } + } + } }