mirror of https://github.com/apache/lucene.git
Output binary doc values as hex array in SimpleTextCodec (#12987)
Binary doc values were being written directly in SimpleTextCodec, though they may not be valid UTF-8 (i.e. they may not be "text"). This change encodes them as a string representing an array of hexadecimal bytes.
This commit is contained in:
parent
8bee41880e
commit
7dfef017e3
|
@ -302,6 +302,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
|
|||
IntFunction<BytesRef> values =
|
||||
new IntFunction<BytesRef>() {
|
||||
final BytesRefBuilder term = new BytesRefBuilder();
|
||||
final BytesRefBuilder termByteArray = new BytesRefBuilder();
|
||||
|
||||
@Override
|
||||
public BytesRef apply(int docID) {
|
||||
|
@ -329,9 +330,10 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
|
|||
} catch (ParseException pe) {
|
||||
throw new CorruptIndexException("failed to parse int length", in, pe);
|
||||
}
|
||||
term.grow(len);
|
||||
term.setLength(len);
|
||||
in.readBytes(term.bytes(), 0, len);
|
||||
termByteArray.grow(len);
|
||||
termByteArray.setLength(len);
|
||||
in.readBytes(termByteArray.bytes(), 0, len);
|
||||
term.copyBytes(SimpleTextUtil.fromBytesRefString(termByteArray.get().utf8ToString()));
|
||||
return term.get();
|
||||
} catch (IOException ioe) {
|
||||
throw new RuntimeException(ioe);
|
||||
|
|
|
@ -169,7 +169,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
|
|||
int maxLength = 0;
|
||||
BinaryDocValues values = valuesProducer.getBinary(field);
|
||||
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||
maxLength = Math.max(maxLength, values.binaryValue().length);
|
||||
maxLength = Math.max(maxLength, values.binaryValue().toString().length());
|
||||
}
|
||||
writeFieldEntry(field, DocValuesType.BINARY);
|
||||
|
||||
|
@ -197,17 +197,16 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
|
|||
values.nextDoc();
|
||||
assert values.docID() >= i;
|
||||
}
|
||||
String stringVal = values.docID() == i ? values.binaryValue().toString() : null;
|
||||
// write length
|
||||
final int length = values.docID() != i ? 0 : values.binaryValue().length;
|
||||
final int length = stringVal == null ? 0 : stringVal.length();
|
||||
SimpleTextUtil.write(data, LENGTH);
|
||||
SimpleTextUtil.write(data, encoder.format(length), scratch);
|
||||
SimpleTextUtil.writeNewline(data);
|
||||
|
||||
// write bytes -- don't use SimpleText.write
|
||||
// because it escapes:
|
||||
if (values.docID() == i) {
|
||||
BytesRef value = values.binaryValue();
|
||||
data.writeBytes(value.bytes, value.offset, value.length);
|
||||
// write bytes as hex array
|
||||
if (stringVal != null) {
|
||||
SimpleTextUtil.write(data, stringVal, scratch);
|
||||
}
|
||||
|
||||
// pad to fit
|
||||
|
@ -215,7 +214,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
|
|||
data.writeByte((byte) ' ');
|
||||
}
|
||||
SimpleTextUtil.writeNewline(data);
|
||||
if (values.docID() != i) {
|
||||
if (stringVal == null) {
|
||||
SimpleTextUtil.write(data, "F", scratch);
|
||||
} else {
|
||||
SimpleTextUtil.write(data, "T", scratch);
|
||||
|
|
|
@ -111,6 +111,9 @@ class SimpleTextUtil {
|
|||
throw new IllegalArgumentException(
|
||||
"string " + s + " was not created from BytesRef.toString?");
|
||||
}
|
||||
if (s.length() == 2) {
|
||||
return new BytesRef(BytesRef.EMPTY_BYTES);
|
||||
}
|
||||
String[] parts = s.substring(1, s.length() - 1).split(" ");
|
||||
byte[] bytes = new byte[parts.length];
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
|
|
|
@ -16,8 +16,22 @@
|
|||
*/
|
||||
package org.apache.lucene.codecs.simpletext;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.document.BinaryDocValuesField;
|
||||
import org.apache.lucene.document.NumericDocValuesField;
|
||||
import org.apache.lucene.document.SortedDocValuesField;
|
||||
import org.apache.lucene.document.SortedSetDocValuesField;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.tests.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.tests.index.BaseDocValuesFormatTestCase;
|
||||
import org.apache.lucene.tests.index.RandomIndexWriter;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/** Tests SimpleTextDocValuesFormat */
|
||||
public class TestSimpleTextDocValuesFormat extends BaseDocValuesFormatTestCase {
|
||||
|
@ -27,4 +41,37 @@ public class TestSimpleTextDocValuesFormat extends BaseDocValuesFormatTestCase {
|
|||
protected Codec getCodec() {
|
||||
return codec;
|
||||
}
|
||||
|
||||
/**
 * Indexes 100 documents, each carrying a sorted, a sorted-set, a numeric and a random binary doc
 * values field, then loads every directory file whose name ends with "dat" fully into memory and
 * asserts that {@code BytesRef.toString()} and {@code Term.toString(BytesRef)} render its content
 * differently.
 *
 * <p>NOTE(review): the inequality is presumably a stand-in for "the file is valid UTF-8" --
 * i.e. it assumes {@code Term.toString} only returns the same text as {@code BytesRef.toString}
 * when the bytes cannot be decoded as UTF-8. Confirm against the {@code Term.toString}
 * implementation.
 *
 * @throws IOException if the directory, the writer or the input cannot be used or closed
 */
public void testFileIsUTF8() throws IOException {
|
||||
try (Directory dir = newDirectory()) {
|
||||
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
|
||||
// The writer is scoped to its own try-with-resources so it is closed before the
// directory's files are listed and read below.
try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf)) {
|
||||
for (int i = 0; i < 100; i++) {
|
||||
// One document per iteration with one field of each kind; the binary field gets a
// random binary term, the two sorted fields get random simple strings.
writer.addDocument(
|
||||
List.of(
|
||||
new SortedDocValuesField(
|
||||
"sortedVal", newBytesRef(TestUtil.randomSimpleString(random()))),
|
||||
new SortedSetDocValuesField(
|
||||
"sortedSetVal", newBytesRef(TestUtil.randomSimpleString(random()))),
|
||||
new NumericDocValuesField("numberVal", random().nextLong()),
|
||||
new BinaryDocValuesField("binaryVal", TestUtil.randomBinaryTerm(random()))));
|
||||
}
|
||||
}
|
||||
for (String file : dir.listAll()) {
|
||||
// Only files whose name ends with "dat" are inspected.
// NOTE(review): presumably these are the SimpleText doc values data files -- confirm
// against the format's file extension.
if (file.endsWith("dat")) {
|
||||
try (IndexInput input = dir.openChecksumInput(file)) {
|
||||
long length = input.length();
|
||||
// Guard runs before the allocation below, so the (int) cast of length happens only
// for lengths of at most 20,000.
if (length > 20_000) {
|
||||
// Avoid allocating a huge array if the length is wrong
|
||||
fail("Doc values should not be this large");
|
||||
}
|
||||
byte[] bytes = new byte[(int) length];
|
||||
// Read the entire file content into the array.
input.readBytes(bytes, 0, bytes.length);
|
||||
BytesRef bytesRef = new BytesRef(bytes);
|
||||
// Fails when both renderings of the file content are the same string.
assertNotEquals(bytesRef.toString(), Term.toString(bytesRef));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue