Output binary doc values as hex array in SimpleTextCodec (#12987)

Binary doc values were previously written verbatim by SimpleTextCodec, even
though they may not be valid UTF-8 (i.e. they may not be "text"). This change
encodes them as a string representing an array of hexadecimal bytes (the
`BytesRef.toString()` form, e.g. `[de ad be ef]`).
This commit is contained in:
Michael Froh 2024-01-12 15:09:18 +00:00 committed by GitHub
parent 8bee41880e
commit 7dfef017e3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 62 additions and 11 deletions

View File

@ -302,6 +302,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
IntFunction<BytesRef> values =
new IntFunction<BytesRef>() {
final BytesRefBuilder term = new BytesRefBuilder();
final BytesRefBuilder termByteArray = new BytesRefBuilder();
@Override
public BytesRef apply(int docID) {
@ -329,9 +330,10 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
term.grow(len);
term.setLength(len);
in.readBytes(term.bytes(), 0, len);
termByteArray.grow(len);
termByteArray.setLength(len);
in.readBytes(termByteArray.bytes(), 0, len);
term.copyBytes(SimpleTextUtil.fromBytesRefString(termByteArray.get().utf8ToString()));
return term.get();
} catch (IOException ioe) {
throw new RuntimeException(ioe);

View File

@ -169,7 +169,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
int maxLength = 0;
BinaryDocValues values = valuesProducer.getBinary(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
maxLength = Math.max(maxLength, values.binaryValue().length);
maxLength = Math.max(maxLength, values.binaryValue().toString().length());
}
writeFieldEntry(field, DocValuesType.BINARY);
@ -197,17 +197,16 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
values.nextDoc();
assert values.docID() >= i;
}
String stringVal = values.docID() == i ? values.binaryValue().toString() : null;
// write length
final int length = values.docID() != i ? 0 : values.binaryValue().length;
final int length = stringVal == null ? 0 : stringVal.length();
SimpleTextUtil.write(data, LENGTH);
SimpleTextUtil.write(data, encoder.format(length), scratch);
SimpleTextUtil.writeNewline(data);
// write bytes -- don't use SimpleText.write
// because it escapes:
if (values.docID() == i) {
BytesRef value = values.binaryValue();
data.writeBytes(value.bytes, value.offset, value.length);
// write bytes as hex array
if (stringVal != null) {
SimpleTextUtil.write(data, stringVal, scratch);
}
// pad to fit
@ -215,7 +214,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
data.writeByte((byte) ' ');
}
SimpleTextUtil.writeNewline(data);
if (values.docID() != i) {
if (stringVal == null) {
SimpleTextUtil.write(data, "F", scratch);
} else {
SimpleTextUtil.write(data, "T", scratch);

View File

@ -111,6 +111,9 @@ class SimpleTextUtil {
throw new IllegalArgumentException(
"string " + s + " was not created from BytesRef.toString?");
}
if (s.length() == 2) {
return new BytesRef(BytesRef.EMPTY_BYTES);
}
String[] parts = s.substring(1, s.length() - 1).split(" ");
byte[] bytes = new byte[parts.length];
for (int i = 0; i < parts.length; i++) {

View File

@ -16,8 +16,22 @@
*/
package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.index.BaseDocValuesFormatTestCase;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.BytesRef;
/** Tests SimpleTextDocValuesFormat */
public class TestSimpleTextDocValuesFormat extends BaseDocValuesFormatTestCase {
@ -27,4 +41,37 @@ public class TestSimpleTextDocValuesFormat extends BaseDocValuesFormatTestCase {
protected Codec getCodec() {
// Hands back the `codec` field, which is declared outside this excerpt.
// NOTE(review): presumably consumed by BaseDocValuesFormatTestCase to pick the format under test — confirm.
return codec;
}
/**
 * Indexes 100 documents carrying sorted, sorted-set, numeric and binary doc values, then reads
 * back every file whose name ends in {@code "dat"} and checks that its bytes render differently
 * through {@link Term#toString(BytesRef)} than through {@link BytesRef#toString()}.
 *
 * <p>NOTE(review): this presumably relies on {@code Term.toString(BytesRef)} falling back to the
 * hex-array form when the bytes are not valid UTF-8, so that inequality implies the file decoded
 * as UTF-8 — confirm against {@code Term}.
 *
 * @throws IOException if the directory or any of its files cannot be accessed
 */
public void testFileIsUTF8() throws IOException {
  try (Directory directory = newDirectory()) {
    IndexWriterConfig writerConfig = newIndexWriterConfig(new MockAnalyzer(random()));
    try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory, writerConfig)) {
      int remaining = 100;
      while (remaining-- > 0) {
        // Fields are built in the same order as they are listed so the sequence of random()
        // draws is unchanged.
        SortedDocValuesField sortedField =
            new SortedDocValuesField(
                "sortedVal", newBytesRef(TestUtil.randomSimpleString(random())));
        SortedSetDocValuesField sortedSetField =
            new SortedSetDocValuesField(
                "sortedSetVal", newBytesRef(TestUtil.randomSimpleString(random())));
        NumericDocValuesField numericField =
            new NumericDocValuesField("numberVal", random().nextLong());
        BinaryDocValuesField binaryField =
            new BinaryDocValuesField("binaryVal", TestUtil.randomBinaryTerm(random()));
        indexWriter.addDocument(List.of(sortedField, sortedSetField, numericField, binaryField));
      }
    }

    for (String fileName : directory.listAll()) {
      if (!file­NameIsDocValuesData(fileName)) {
        continue;
      }
      try (IndexInput in = directory.openChecksumInput(fileName)) {
        long fileLength = in.length();
        if (fileLength > 20_000) {
          // Avoid allocating a huge array if the length is wrong
          fail("Doc values should not be this large");
        }
        byte[] contents = new byte[(int) fileLength];
        in.readBytes(contents, 0, contents.length);
        BytesRef wholeFile = new BytesRef(contents);
        String hexForm = wholeFile.toString();
        assertNotEquals(hexForm, Term.toString(wholeFile));
      }
    }
  }
}

/** Tells whether {@code fileName} ends with the {@code "dat"} suffix inspected by the test. */
private static boolean file­NameIsDocValuesData(String fileName) {
  return fileName.endsWith("dat");
}
}