SOLR-11240: Raise UnInvertedField internal limit

2017-08-22 15:12:38 +02:00 · 2017-08-22 15:12:38 +02:00 · 85b89d15a8
parent b67424ee58
commit 85b89d15a8
4 changed files with 129 additions and 67 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -152,6 +152,8 @@ Other Changes

 * SOLR-10628: Less verbose output from bin/solr commands. (Jason Gerlowski, janhoy) 

+* SOLR-11240: Raise UnInvertedField internal limit. (Toke Eskildsen)
+
 ==================  7.0.0 ==================

 Versions of Major Components
--- a/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java
@ -97,8 +97,10 @@ public class UnInvertedField extends DocTermOrds {
  long memsz;
  final AtomicLong use = new AtomicLong(); // number of uses

+  /** The number of documents holding the term: {@code maxDocs = maxTermCounts[termNum]}. */
  int[] maxTermCounts = new int[1024];

+  /* termNum -> docIDs for big terms. */
  final Map<Integer,TopTerm> bigTerms = new LinkedHashMap<>();

  private SolrIndexSearcher.DocsEnumState deState;
@ -111,6 +113,12 @@ public class UnInvertedField extends DocTermOrds {
    searcher = null;
  }

+  /**
+   * Called for each term in the field being uninverted.
+   * Collects {@link #maxTermCounts} for all bigTerms as well as storing them in {@link #bigTerms}.
+   * @param te positioned at the current term.
+   * @param termNum the ID/pointer/ordinal of the current term. Monotonically increasing between calls.
+   */
  @Override
  protected void visitTerm(TermsEnum te, int termNum) throws IOException {

@ -164,10 +172,6 @@ public class UnInvertedField extends DocTermOrds {
    }
    if (maxTermCounts != null)
      sz += maxTermCounts.length * 4;
-    if (indexedTermsArray != null) {
-      // assume 8 byte references?
-      sz += 8+8+8+8+(indexedTermsArray.length<<3)+sizeOfIndexedStrings;
-    }
    memsz = sz;
    return sz;
  }
@ -258,8 +262,8 @@ public class UnInvertedField extends DocTermOrds {
      if (termInstances > 0) {
        int code = index[doc];

-        if ((code & 0xff)==1) {
-          int pos = code>>>8;
+        if ((code & 0x80000000)!=0) {
+          int pos = code & 0x7fffffff;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
@ -344,8 +348,8 @@ public class UnInvertedField extends DocTermOrds {
        int doc = iter.nextDoc();
        int code = index[doc];

-        if ((code & 0xff) == 1) {
-          int pos = code >>> 8;
+        if ((code & 0x80000000)!=0) {
+          int pos = code & 0x7fffffff;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
@ -469,8 +473,8 @@ public class UnInvertedField extends DocTermOrds {

        int code = index[doc];

-        if ((code & 0xff)==1) {
-          int pos = code>>>8;
+        if ((code & 0x80000000)!=0) {
+          int pos = code & 0x7fffffff;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
--- a/solr/core/src/java/org/apache/solr/uninverting/DocTermOrds.java
+++ b/solr/core/src/java/org/apache/solr/uninverting/DocTermOrds.java
@ -55,10 +55,9 @@ import org.apache.lucene.util.StringHelper;
 * int as the internal representation here cannot address
 * more than MAX_INT unique terms.  Also, typically this
 * class is used on fields with relatively few unique terms
- * vs the number of documents.  In addition, there is an
- * internal limit (16 MB) on how many bytes each chunk of
- * documents may consume.  If you trip this limit you'll hit
- * an IllegalStateException.
+ * vs the number of documents. A previous internal limit (16 MB)
+ * on how many bytes each chunk of documents may consume has been
+ * increased to 2 GB.
 *
 * Deleted documents are skipped during uninversion, and if
 * you look them up you'll get 0 ords.
@ -69,11 +68,10 @@ import org.apache.lucene.util.StringHelper;
 * are also de-dup'd (ie if doc has same term more than once
 * in this field, you'll only get that ord back once).
 *
- * This class
- * will create its own term index internally, allowing to
- * create a wrapped TermsEnum that can handle ord.  The
- * {@link #getOrdTermsEnum} method then provides this
- * wrapped enum.
+ * This class will create its own term index internally, which makes
+ * it possible to create a wrapped TermsEnum that can handle ord.
+ * The {@link #getOrdTermsEnum} method then provides this wrapped
+ * enum.
 *
 * The RAM consumption of this class can be high!
 *
@ -81,7 +79,7 @@ import org.apache.lucene.util.StringHelper;
 */

 /*
- * Final form of the un-inverted field:
+ * The un-inverted field:
 *   Each document points to a list of term numbers that are contained in that document.
 *
 *   Term numbers are in sorted order, and are encoded as variable-length deltas from the
@ -89,13 +87,17 @@ import org.apache.lucene.util.StringHelper;
 *   term number of 0 signals the end of the termNumber list.
 *
 *   There is a single int[maxDoc()] which either contains a pointer into a byte[] for
- *   the termNumber lists, or directly contains the termNumber list if it fits in the 4
- *   bytes of an integer.  If the first byte in the integer is 1, the next 3 bytes
- *   are a pointer into a byte[] where the termNumber list starts.
+ *   the termNumber lists, or directly contains the termNumber list if it fits as a vInt-list
+ *   in the 4 bytes of an integer. As bit 7 within each byte is used in the vInt encoding to
+ *   signal overflow into the next byte, bit 7 of the highest byte (bit 31 in the full integer)
+ *   will never be 1. If bit 31 in the integer is set, this signals a pointer and bits 0-30
+ *   are then the value of the pointer into a byte[] where the termNumber list starts.
 *
- *   There are actually 256 byte arrays, to compensate for the fact that the pointers
- *   into the byte arrays are only 3 bytes long.  The correct byte array for a document
- *   is a function of its id.
+ *   A single entry is thus either 0b0xxxxxxx_xxxxxxxx_xxxxxxxx_xxxxxxxx holding 0-4 vInts
+ *   (low byte first) or 0b1xxxxxxx_xxxxxxxx_xxxxxxxx_xxxxxxxx holding a 31-bit pointer.
+ *
+ *   There are 256 byte arrays, as the previous version of DocTermOrds had a pointer limit
+ *   of 24 bits / 3 bytes. The correct byte array for a document is a function of its id.
 *
 *   To save space and speed up faceting, any term that matches enough documents will
 *   not be un-inverted... it will be skipped while building the un-inverted field structure,
@ -107,7 +109,6 @@ import org.apache.lucene.util.StringHelper;
 *   is stored, along with its corresponding term number, and this is used as an
 *   index to find the closest term and iterate until the desired number is hit (very
 *   much like Lucene's own internal term index).
- *
 */

 public class DocTermOrds implements Accountable {
@ -152,6 +153,9 @@ public class DocTermOrds implements Accountable {
  protected long sizeOfIndexedStrings;

  /** Holds the indexed (by default every 128th) terms. */
+  // TODO: This seems like an obvious candidate for using BytesRefArray extended with binarySearch
+  // This would save heap space as well as avoid a lot of small Objects (BytesRefs).
+  // This would also increase data locality for binarySearch lookups, potentially making it faster.
  protected BytesRef[] indexedTermsArray = new BytesRef[0];

  /** If non-null, only terms matching this prefix were
@ -170,7 +174,9 @@ public class DocTermOrds implements Accountable {
   * Normally, docValues should be used in preference to DocTermOrds. */
  protected boolean checkForDocValues = true;

+  // NOTE: indexedTermsArray is accounted for in this estimate (it was previously only counted by UnInvertedField).
  /** Returns total bytes used. */
+  @Override
  public long ramBytesUsed() {
    // can cache the mem size since it shouldn't change
    if (memsz!=0) return memsz;
@ -180,11 +186,15 @@ public class DocTermOrds implements Accountable {
      for (byte[] arr : tnums)
        if (arr != null) sz += arr.length;
    }
+    if (indexedTermsArray != null) {
+      // assume 8 byte references?
+      sz += 8+8+8+8+(indexedTermsArray.length<<3)+sizeOfIndexedStrings;
+    }
    memsz = sz;
    return sz;
  }

-  /** Inverts all terms */
+  /** Inverts all terms. */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field) throws IOException {
    this(reader, liveDocs, field, null, Integer.MAX_VALUE);
  }
@ -374,10 +384,9 @@ public class DocTermOrds implements Accountable {
          lastTerm[doc] = termNum;
          int val = index[doc];

-          if ((val & 0xff)==1) {
-            // index into byte array (actually the end of
-            // the doc-specific byte[] when building)
-            int pos = val >>> 8;
+          if ((val & 0x80000000) != 0) {
+            // index into byte array (actually the end of the doc-specific byte[] when building)
+            int pos = val & 0x7fffffff;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos+ilen;
@ -395,7 +404,7 @@ public class DocTermOrds implements Accountable {
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
-            index[doc] = (pos<<8) | 1;  // update pointer to end index in byte[]
+            index[doc] = pos | 0x80000000;  // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
@ -430,7 +439,7 @@ public class DocTermOrds implements Accountable {
                val >>>=8;
              }
              // point at the end index in the byte[]
-              index[doc] = (endPos<<8) | 1;
+              index[doc] = endPos | 0x80000000;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }
@ -480,14 +489,11 @@ public class DocTermOrds implements Accountable {
          for (int doc=docbase; doc<lim; doc++) {
            //System.out.println("  pass=" + pass + " process docID=" + doc);
            int val = index[doc];
-            if ((val&0xff) == 1) {
-              int len = val >>> 8;
+            if ((val & 0x80000000) != 0) {
+              int len = val & 0x7fffffff;
              //System.out.println("    ptr pos=" + pos);
-              index[doc] = (pos<<8)|1; // change index to point to start of array
-              if ((pos & 0xff000000) != 0) {
-                // we only have 24 bits for the array index
-                throw new IllegalStateException("Too many values for UnInvertedField faceting on field "+field);
-              }
+              //index[doc] = (pos<<8)|1; // change index to point to start of array
+              index[doc] = pos | 0x80000000; // change index to point to start of array
              byte[] arr = bytes[doc];
              /*
              for(byte b : arr) {
@ -497,19 +503,15 @@ public class DocTermOrds implements Accountable {
              bytes[doc] = null;        // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
-                /*** we don't have to worry about the array getting too large
-                 * since the "pos" param will overflow first (only 24 bits available)
-                if ((newlen<<1) <= 0) {
-                  // overflow...
-                  newlen = Integer.MAX_VALUE;
+                while (newlen <= pos + len) {
+                  if ((newlen<<=1) < 0) { // Double until overflow
+                    newlen = Integer.MAX_VALUE - 16; // ArrayList.MAX_ARRAY_SIZE says 8. We double that to be sure
                    if (newlen <= pos + len) {
-                    throw new SolrException(400,"Too many terms to uninvert field!");
+                      throw new IllegalStateException(
+                          "Too many terms (> Integer.MAX_VALUE-16) to uninvert field '" + field + "'");
+                    }
                  }
-                } else {
-                  while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                }
-                ****/
-                while (newlen <= pos + len) newlen<<=1;  // doubling strategy                 
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
@ -544,23 +546,24 @@ public class DocTermOrds implements Accountable {

  /** Number of bytes to represent an unsigned int as a vint. */
  private static int vIntSize(int x) {
-    if ((x & (0xffffffff << (7*1))) == 0 ) {
-      return 1;
-    }
-    if ((x & (0xffffffff << (7*2))) == 0 ) {
-      return 2;
-    }
-    if ((x & (0xffffffff << (7*3))) == 0 ) {
-      return 3;
-    }
-    if ((x & (0xffffffff << (7*4))) == 0 ) {
-      return 4;
-    }
-    return 5;
+    // Tests outside of this code base show that the previous conditional-based vIntSize is fairly slow until
+    // JITted and still about 1/3 slower after JIT than the numberOfLeadingZeros version below.
+    return BLOCK7[Integer.numberOfLeadingZeros(x)]; // Intrinsic on modern CPUs
  }
+  private final static byte[] BLOCK7 = new byte[]{
+          5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1};

  // todo: if we know the size of the vInt already, we could do
  // a single switch on the size
+
+  /**
+   * Write the x value as vInt at pos in arr, returning the new endPos. This requires arr to be capable of holding the
+   * bytes needed to represent x. Array length checking should be performed beforehand.
+   * @param x   the value to write as vInt.
+   * @param arr the array holding vInt-values.
+   * @param pos the position in arr where the vInt representation of x should be written.
+   * @return the new end position after writing x at pos.
+   */
  private static int writeInt(int x, byte[] arr, int pos) {
    int a;
    a = (x >>> (7*4));
@ -830,9 +833,9 @@ public class DocTermOrds implements Accountable {
    public void setDocument(int docID) {
      tnum = 0;
      final int code = index[docID];
-      if ((code & 0xff)==1) {
+      if ((code & 0x80000000) != 0) {
        // a pointer
-        upto = code>>>8;
+        upto = code & 0x7fffffff;
        //System.out.println("    pointer!  upto=" + upto);
        int whichArray = (docID >>> 16) & 0xff;
        arr = tnums[whichArray];
--- a/solr/core/src/test/org/apache/solr/uninverting/TestDocTermOrds.java
+++ b/solr/core/src/test/org/apache/solr/uninverting/TestDocTermOrds.java
@ -136,6 +136,59 @@ public class TestDocTermOrds extends LuceneTestCase {
    dir.close();
  }

+  /* UnInvertedField had a reference block limitation of 2^24. This unit test triggered it.
+   *
+   * With the current code, the test verifies that the old limit no longer applies.
+   * New limit is 2^31, which is not very realistic to unit-test. */
+  @SuppressWarnings({"ConstantConditions", "PointlessBooleanExpression"})
+  @Nightly
+  public void testTriggerUnInvertLimit() throws IOException {
+    final boolean SHOULD_TRIGGER = false; // Set this to true to use the test with the old implementation
+
+    // Ensure enough terms inside of a single UnInvert-pass-structure to trigger the limit
+    final int REF_LIMIT = (int) Math.pow(2, 24); // Maximum number of references within a single pass-structure
+    final int DOCS = (1<<16)-1;                  // The number of documents within a single pass (simplified)
+    final int TERMS = REF_LIMIT/DOCS;            // Each document must have this many references aka terms hit limit
+
+    Directory dir = newDirectory();
+    final RandomIndexWriter w = new RandomIndexWriter(random(), dir,
+        newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy()));
+    Document doc = new Document();
+    Field field = newTextField("field", "", Field.Store.NO);
+    doc.add(field);
+
+    StringBuilder sb = new StringBuilder(TERMS*(Integer.toString(TERMS).length()+1));
+    for (int i = 0 ; i < TERMS ; i++) {
+      sb.append(" ").append(Integer.toString(i));
+    }
+    field.setStringValue(sb.toString());
+
+    for (int i = 0 ; i < DOCS ; i++) {
+      w.addDocument(doc);
+    }
+    //System.out.println("\n Finished adding " + DOCS + " documents of " + TERMS + " unique terms");
+    final IndexReader r = w.getReader();
+    w.close();
+    
+    try {
+      final LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
+      TestUtil.checkReader(ar);
+      final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field"); // bigTerms turned off
+      if (SHOULD_TRIGGER) {
+        fail("DocTermOrds should have failed with a \"Too many values for UnInvertedField\" message");
+      }
+    } catch (IllegalStateException e) {
+      if (!SHOULD_TRIGGER) {
+        fail("DocTermsOrd should not have failed with this implementation, but got exception " +
+            e.getClass().getSimpleName() + " with message " + e.getMessage());
+      }
+      // This is (hopefully) "Too many values for UnInvertedField faceting on field field", so all is as expected
+    } finally {
+      r.close();
+      dir.close();
+    }
+  }
+
  public void testRandom() throws Exception {
    Directory dir = newDirectory();