LUCENE-10405: binary and Sorted doc values are stored as BytesRef instead of BytesRefHash in memory index (#647)

When using the MemoryIndex, binary and sorted doc values are now stored
as a BytesRef instead of in a BytesRefHash, so they don't have a limit on size.
This commit is contained in:
Ignacio Vera 2022-02-07 07:33:07 +01:00 committed by GitHub
parent deef3c704e
commit 4c578017af
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 26 deletions

View File

@ -232,6 +232,9 @@ Bug Fixes
* LUCENE-10407: Containing intervals could sometimes yield incorrect matches when wrapped
in a disjunction. (Alan Woodward, Dawid Weiss)
* LUCENE-10405: When using the MemoryIndex, binary and sorted doc values are now stored
as a BytesRef instead of in a BytesRefHash, so they don't have a limit on size. (Ignacio Vera)
Other
---------------------

View File

@ -578,17 +578,6 @@ public class MemoryIndex {
info.numericProducer.dvLongValues[info.numericProducer.count++] = (long) docValuesValue;
break;
case BINARY:
if (info.binaryProducer.dvBytesValuesSet != null) {
throw new IllegalArgumentException(
"Only one value per field allowed for ["
+ docValuesType
+ "] doc values field ["
+ fieldName
+ "]");
}
info.binaryProducer.dvBytesValuesSet = new BytesRefHash(byteBlockPool);
info.binaryProducer.dvBytesValuesSet.add((BytesRef) docValuesValue);
break;
case SORTED:
if (info.binaryProducer.dvBytesValuesSet != null) {
throw new IllegalArgumentException(
@ -598,14 +587,13 @@ public class MemoryIndex {
+ fieldName
+ "]");
}
info.binaryProducer.dvBytesValuesSet = new BytesRefHash(byteBlockPool);
info.binaryProducer.dvBytesValuesSet.add((BytesRef) docValuesValue);
info.binaryProducer.dvBytesValuesSet = ((BytesRef) docValuesValue).clone();
break;
case SORTED_SET:
if (info.binaryProducer.dvBytesValuesSet == null) {
info.binaryProducer.dvBytesValuesSet = new BytesRefHash(byteBlockPool);
if (info.bytesRefHashProducer.dvBytesRefHashValuesSet == null) {
info.bytesRefHashProducer.dvBytesRefHashValuesSet = new BytesRefHash(byteBlockPool);
}
info.binaryProducer.dvBytesValuesSet.add((BytesRef) docValuesValue);
info.bytesRefHashProducer.dvBytesRefHashValuesSet.add((BytesRef) docValuesValue);
break;
case NONE:
default:
@ -866,6 +854,8 @@ public class MemoryIndex {
/** the last offset encountered in this field for multi field support */
private int lastOffset;
private BytesRefHashDocValuesProducer bytesRefHashProducer;
private BinaryDocValuesProducer binaryProducer;
private NumericDocValuesProducer numericProducer;
@ -884,7 +874,8 @@ public class MemoryIndex {
this.fieldInfo = fieldInfo;
this.sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
this.terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
;
this.bytesRefHashProducer = new BytesRefHashDocValuesProducer();
this.binaryProducer = new BinaryDocValuesProducer();
this.numericProducer = new NumericDocValuesProducer();
}
@ -914,10 +905,8 @@ public class MemoryIndex {
if (dvType == DocValuesType.NUMERIC || dvType == DocValuesType.SORTED_NUMERIC) {
numericProducer.prepareForUsage();
}
if (dvType == DocValuesType.BINARY
|| dvType == DocValuesType.SORTED
|| dvType == DocValuesType.SORTED_SET) {
binaryProducer.prepareForUsage();
if (dvType == DocValuesType.SORTED_SET) {
bytesRefHashProducer.prepareForUsage();
}
if (pointValues != null) {
assert pointValues[0].bytes.length == pointValues[0].length
@ -1193,12 +1182,16 @@ public class MemoryIndex {
}
/** Holder for the value of a single-valued, {@link BytesRef}-backed doc values field. */
private static final class BinaryDocValuesProducer {
// Despite the "Set" suffix this is one value, not a collection: the indexing code rejects a
// second value for the field and stores a clone of the incoming BytesRef here.
BytesRef dvBytesValuesSet;
}
BytesRefHash dvBytesValuesSet;
private static final class BytesRefHashDocValuesProducer {
BytesRefHash dvBytesRefHashValuesSet;
int[] bytesIds;
private void prepareForUsage() {
bytesIds = dvBytesValuesSet.sort();
bytesIds = dvBytesRefHashValuesSet.sort();
}
}
@ -1316,8 +1309,7 @@ public class MemoryIndex {
private SortedDocValues getSortedDocValues(String field, DocValuesType docValuesType) {
Info info = getInfoForExpectedDocValuesType(field, docValuesType);
if (info != null) {
BytesRef value = info.binaryProducer.dvBytesValuesSet.get(0, new BytesRef());
return sortedDocValues(value);
return sortedDocValues(info.binaryProducer.dvBytesValuesSet);
} else {
return null;
}
@ -1338,7 +1330,7 @@ public class MemoryIndex {
Info info = getInfoForExpectedDocValuesType(field, DocValuesType.SORTED_SET);
if (info != null) {
return sortedSetDocValues(
info.binaryProducer.dvBytesValuesSet, info.binaryProducer.bytesIds);
info.bytesRefHashProducer.dvBytesRefHashValuesSet, info.bytesRefHashProducer.bytesIds);
} else {
return null;
}

View File

@ -430,6 +430,30 @@ public class TestMemoryIndex extends LuceneTestCase {
assertEquals("quick brown fox", binaryDocValues.binaryValue().utf8ToString());
}
/**
 * Indexes a binary doc value of 33 * 1024 random bytes into a {@link MemoryIndex} and checks that
 * the same value is read back from the resulting leaf reader.
 */
public void testBigBinaryDocValues() throws Exception {
  Document doc = new Document();
  byte[] bytes = new byte[33 * 1024];
  random().nextBytes(bytes);
  doc.add(new BinaryDocValuesField("binary", new BytesRef(bytes)));
  MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer, true, true);
  LeafReader leafReader = mi.createSearcher().getIndexReader().leaves().get(0).reader();
  BinaryDocValues binaryDocValues = leafReader.getBinaryDocValues("binary");
  assertEquals(0, binaryDocValues.nextDoc());
  // Compare by BytesRef value (honours offset/length) rather than against the raw backing
  // array, so the test does not depend on how the value happens to be stored internally.
  assertEquals(new BytesRef(bytes), binaryDocValues.binaryValue());
}
/**
 * Indexes a sorted doc value of 33 * 1024 random bytes into a {@link MemoryIndex} and checks that
 * the same value is returned for ordinal 0 by the resulting leaf reader.
 */
public void testBigSortedDocValues() throws Exception {
  Document doc = new Document();
  byte[] bytes = new byte[33 * 1024];
  random().nextBytes(bytes);
  doc.add(new SortedDocValuesField("binary", new BytesRef(bytes)));
  MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer, true, true);
  LeafReader leafReader = mi.createSearcher().getIndexReader().leaves().get(0).reader();
  SortedDocValues sortedDocValues = leafReader.getSortedDocValues("binary");
  assertEquals(0, sortedDocValues.nextDoc());
  // Compare by BytesRef value (honours offset/length) rather than against the raw backing
  // array, so the test does not depend on how the value happens to be stored internally.
  assertEquals(new BytesRef(bytes), sortedDocValues.lookupOrd(0));
}
public void testPointValues() throws Exception {
List<Function<Long, IndexableField>> fieldFunctions =
Arrays.asList(