LUCENE-10405: Binary and sorted doc values are stored as BytesRef instead of BytesRefHash in the memory index (#647)

When using the MemoryIndex, binary and sorted doc values are now stored as a plain BytesRef instead of in a BytesRefHash, so they no longer have a limit on size.
2022-02-07 07:33:07 +01:00 · 2022-02-07 07:33:07 +01:00 · 4c578017af
parent deef3c704e
commit 4c578017af
3 changed files with 45 additions and 26 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -232,6 +232,9 @@ Bug Fixes

 * LUCENE-10407: Containing intervals could sometimes yield incorrect matches when wrapped
  in a disjunction. (Alan Woodward, Dawid Weiss)
+  
+* LUCENE-10405: When using the MemoryIndex, binary and sorted doc values are stored
+  as BytesRef instead of BytesRefHash so they don't have a limit on size. (Ignacio Vera)

 Other
 ---------------------
--- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@ -578,17 +578,6 @@ public class MemoryIndex {
        info.numericProducer.dvLongValues[info.numericProducer.count++] = (long) docValuesValue;
        break;
      case BINARY:
-        if (info.binaryProducer.dvBytesValuesSet != null) {
-          throw new IllegalArgumentException(
-              "Only one value per field allowed for ["
-                  + docValuesType
-                  + "] doc values field ["
-                  + fieldName
-                  + "]");
-        }
-        info.binaryProducer.dvBytesValuesSet = new BytesRefHash(byteBlockPool);
-        info.binaryProducer.dvBytesValuesSet.add((BytesRef) docValuesValue);
-        break;
      case SORTED:
        if (info.binaryProducer.dvBytesValuesSet != null) {
          throw new IllegalArgumentException(
@ -598,14 +587,13 @@ public class MemoryIndex {
                  + fieldName
                  + "]");
        }
-        info.binaryProducer.dvBytesValuesSet = new BytesRefHash(byteBlockPool);
-        info.binaryProducer.dvBytesValuesSet.add((BytesRef) docValuesValue);
+        info.binaryProducer.dvBytesValuesSet = ((BytesRef) docValuesValue).clone();
        break;
      case SORTED_SET:
-        if (info.binaryProducer.dvBytesValuesSet == null) {
-          info.binaryProducer.dvBytesValuesSet = new BytesRefHash(byteBlockPool);
+        if (info.bytesRefHashProducer.dvBytesRefHashValuesSet == null) {
+          info.bytesRefHashProducer.dvBytesRefHashValuesSet = new BytesRefHash(byteBlockPool);
        }
-        info.binaryProducer.dvBytesValuesSet.add((BytesRef) docValuesValue);
+        info.bytesRefHashProducer.dvBytesRefHashValuesSet.add((BytesRef) docValuesValue);
        break;
      case NONE:
      default:
@ -866,6 +854,8 @@ public class MemoryIndex {
    /** the last offset encountered in this field for multi field support */
    private int lastOffset;

+    private BytesRefHashDocValuesProducer bytesRefHashProducer;
+
    private BinaryDocValuesProducer binaryProducer;

    private NumericDocValuesProducer numericProducer;
@ -884,7 +874,8 @@ public class MemoryIndex {
      this.fieldInfo = fieldInfo;
      this.sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
      this.terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
-      ;
+
+      this.bytesRefHashProducer = new BytesRefHashDocValuesProducer();
      this.binaryProducer = new BinaryDocValuesProducer();
      this.numericProducer = new NumericDocValuesProducer();
    }
@ -914,10 +905,8 @@ public class MemoryIndex {
        if (dvType == DocValuesType.NUMERIC || dvType == DocValuesType.SORTED_NUMERIC) {
          numericProducer.prepareForUsage();
        }
-        if (dvType == DocValuesType.BINARY
-            || dvType == DocValuesType.SORTED
-            || dvType == DocValuesType.SORTED_SET) {
-          binaryProducer.prepareForUsage();
+        if (dvType == DocValuesType.SORTED_SET) {
+          bytesRefHashProducer.prepareForUsage();
        }
        if (pointValues != null) {
          assert pointValues[0].bytes.length == pointValues[0].length
@ -1193,12 +1182,16 @@ public class MemoryIndex {
  }

  private static final class BinaryDocValuesProducer {
+    BytesRef dvBytesValuesSet;
+  }

-    BytesRefHash dvBytesValuesSet;
+  private static final class BytesRefHashDocValuesProducer {
+
+    BytesRefHash dvBytesRefHashValuesSet;
    int[] bytesIds;

    private void prepareForUsage() {
-      bytesIds = dvBytesValuesSet.sort();
+      bytesIds = dvBytesRefHashValuesSet.sort();
    }
  }

@ -1316,8 +1309,7 @@ public class MemoryIndex {
    private SortedDocValues getSortedDocValues(String field, DocValuesType docValuesType) {
      Info info = getInfoForExpectedDocValuesType(field, docValuesType);
      if (info != null) {
-        BytesRef value = info.binaryProducer.dvBytesValuesSet.get(0, new BytesRef());
-        return sortedDocValues(value);
+        return sortedDocValues(info.binaryProducer.dvBytesValuesSet);
      } else {
        return null;
      }
@ -1338,7 +1330,7 @@ public class MemoryIndex {
      Info info = getInfoForExpectedDocValuesType(field, DocValuesType.SORTED_SET);
      if (info != null) {
        return sortedSetDocValues(
-            info.binaryProducer.dvBytesValuesSet, info.binaryProducer.bytesIds);
+            info.bytesRefHashProducer.dvBytesRefHashValuesSet, info.bytesRefHashProducer.bytesIds);
      } else {
        return null;
      }
--- a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java
+++ b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java
@ -430,6 +430,30 @@ public class TestMemoryIndex extends LuceneTestCase {
    assertEquals("quick brown fox", binaryDocValues.binaryValue().utf8ToString());
  }

+  public void testBigBinaryDocValues() throws Exception {
+    Document doc = new Document();
+    byte[] bytes = new byte[33 * 1024];
+    random().nextBytes(bytes);
+    doc.add(new BinaryDocValuesField("binary", new BytesRef(bytes)));
+    MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer, true, true);
+    LeafReader leafReader = mi.createSearcher().getIndexReader().leaves().get(0).reader();
+    BinaryDocValues binaryDocValues = leafReader.getBinaryDocValues("binary");
+    assertEquals(0, binaryDocValues.nextDoc());
+    assertArrayEquals(bytes, binaryDocValues.binaryValue().bytes);
+  }
+
+  public void testBigSortedDocValues() throws Exception {
+    Document doc = new Document();
+    byte[] bytes = new byte[33 * 1024];
+    random().nextBytes(bytes);
+    doc.add(new SortedDocValuesField("binary", new BytesRef(bytes)));
+    MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer, true, true);
+    LeafReader leafReader = mi.createSearcher().getIndexReader().leaves().get(0).reader();
+    SortedDocValues sortedDocValues = leafReader.getSortedDocValues("binary");
+    assertEquals(0, sortedDocValues.nextDoc());
+    assertArrayEquals(bytes, sortedDocValues.lookupOrd(0).bytes);
+  }
+
  public void testPointValues() throws Exception {
    List<Function<Long, IndexableField>> fieldFunctions =
        Arrays.asList(