mirror of https://github.com/apache/lucene.git
SOLR-11240: Raise UnInvertedField internal limit
This commit is contained in:
parent
b67424ee58
commit
85b89d15a8
|
@ -152,6 +152,8 @@ Other Changes
|
|||
|
||||
* SOLR-10628: Less verbose output from bin/solr commands. (Jason Gerlowski, janhoy)
|
||||
|
||||
* SOLR-11240: Raise UnInvertedField internal limit. (Toke Eskildsen)
|
||||
|
||||
================== 7.0.0 ==================
|
||||
|
||||
Versions of Major Components
|
||||
|
|
|
@ -97,8 +97,10 @@ public class UnInvertedField extends DocTermOrds {
|
|||
long memsz;
|
||||
final AtomicLong use = new AtomicLong(); // number of uses
|
||||
|
||||
/* The number of documents holding the term {@code maxDocs = maxTermCounts[termNum]}. */
|
||||
int[] maxTermCounts = new int[1024];
|
||||
|
||||
/* termNum -> docIDs for big terms. */
|
||||
final Map<Integer,TopTerm> bigTerms = new LinkedHashMap<>();
|
||||
|
||||
private SolrIndexSearcher.DocsEnumState deState;
|
||||
|
@ -111,6 +113,12 @@ public class UnInvertedField extends DocTermOrds {
|
|||
searcher = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Called for each term in the field being uninverted.
|
||||
* Collects {@link #maxTermCounts} for all bigTerms as well as storing them in {@link #bigTerms}.
|
||||
* @param te positioned at the current term.
|
||||
* @param termNum the ID/pointer/ordinal of the current term. Monotonically increasing between calls.
|
||||
*/
|
||||
@Override
|
||||
protected void visitTerm(TermsEnum te, int termNum) throws IOException {
|
||||
|
||||
|
@ -164,10 +172,6 @@ public class UnInvertedField extends DocTermOrds {
|
|||
}
|
||||
if (maxTermCounts != null)
|
||||
sz += maxTermCounts.length * 4;
|
||||
if (indexedTermsArray != null) {
|
||||
// assume 8 byte references?
|
||||
sz += 8+8+8+8+(indexedTermsArray.length<<3)+sizeOfIndexedStrings;
|
||||
}
|
||||
memsz = sz;
|
||||
return sz;
|
||||
}
|
||||
|
@ -258,8 +262,8 @@ public class UnInvertedField extends DocTermOrds {
|
|||
if (termInstances > 0) {
|
||||
int code = index[doc];
|
||||
|
||||
if ((code & 0xff)==1) {
|
||||
int pos = code>>>8;
|
||||
if ((code & 0x80000000)!=0) {
|
||||
int pos = code & 0x7fffffff;
|
||||
int whichArray = (doc >>> 16) & 0xff;
|
||||
byte[] arr = tnums[whichArray];
|
||||
int tnum = 0;
|
||||
|
@ -344,8 +348,8 @@ public class UnInvertedField extends DocTermOrds {
|
|||
int doc = iter.nextDoc();
|
||||
int code = index[doc];
|
||||
|
||||
if ((code & 0xff) == 1) {
|
||||
int pos = code >>> 8;
|
||||
if ((code & 0x80000000)!=0) {
|
||||
int pos = code & 0x7fffffff;
|
||||
int whichArray = (doc >>> 16) & 0xff;
|
||||
byte[] arr = tnums[whichArray];
|
||||
int tnum = 0;
|
||||
|
@ -469,8 +473,8 @@ public class UnInvertedField extends DocTermOrds {
|
|||
|
||||
int code = index[doc];
|
||||
|
||||
if ((code & 0xff)==1) {
|
||||
int pos = code>>>8;
|
||||
if ((code & 0x80000000)!=0) {
|
||||
int pos = code & 0x7fffffff;
|
||||
int whichArray = (doc >>> 16) & 0xff;
|
||||
byte[] arr = tnums[whichArray];
|
||||
int tnum = 0;
|
||||
|
|
|
@ -55,10 +55,9 @@ import org.apache.lucene.util.StringHelper;
|
|||
* int as the internal representation here cannot address
|
||||
* more than MAX_INT unique terms. Also, typically this
|
||||
* class is used on fields with relatively few unique terms
|
||||
* vs the number of documents. In addition, there is an
|
||||
* internal limit (16 MB) on how many bytes each chunk of
|
||||
* documents may consume. If you trip this limit you'll hit
|
||||
* an IllegalStateException.
|
||||
* vs the number of documents. A previous internal limit (16 MB)
|
||||
* on how many bytes each chunk of documents may consume has been
|
||||
* increased to 2 GB.
|
||||
*
|
||||
* Deleted documents are skipped during uninversion, and if
|
||||
* you look them up you'll get 0 ords.
|
||||
|
@ -69,11 +68,10 @@ import org.apache.lucene.util.StringHelper;
|
|||
* are also de-dup'd (ie if doc has same term more than once
|
||||
* in this field, you'll only get that ord back once).
|
||||
*
|
||||
* This class
|
||||
* will create its own term index internally, allowing to
|
||||
* create a wrapped TermsEnum that can handle ord. The
|
||||
* {@link #getOrdTermsEnum} method then provides this
|
||||
* wrapped enum.
|
||||
* This class will create its own term index internally, allowing to
|
||||
* create a wrapped TermsEnum that can handle ord.
|
||||
* The {@link #getOrdTermsEnum} method then provides this wrapped
|
||||
* enum.
|
||||
*
|
||||
* The RAM consumption of this class can be high!
|
||||
*
|
||||
|
@ -81,7 +79,7 @@ import org.apache.lucene.util.StringHelper;
|
|||
*/
|
||||
|
||||
/*
|
||||
* Final form of the un-inverted field:
|
||||
* The un-inverted field:
|
||||
* Each document points to a list of term numbers that are contained in that document.
|
||||
*
|
||||
* Term numbers are in sorted order, and are encoded as variable-length deltas from the
|
||||
|
@ -89,13 +87,17 @@ import org.apache.lucene.util.StringHelper;
|
|||
* term number of 0 signals the end of the termNumber list.
|
||||
*
|
||||
* There is a single int[maxDoc()] which either contains a pointer into a byte[] for
|
||||
* the termNumber lists, or directly contains the termNumber list if it fits in the 4
|
||||
* bytes of an integer. If the first byte in the integer is 1, the next 3 bytes
|
||||
* are a pointer into a byte[] where the termNumber list starts.
|
||||
* the termNumber lists, or directly contains the termNumber list if it fits as a vInt-list
|
||||
* in the 4 bytes of an integer. As bit 7 within each byte is used in the vInt encoding to
|
||||
* signal overflow into the next byte, bit 7 of the highest byte (bit 31 in the full integer)
|
||||
* will never be 1. If bit 31 in the integer is set, this signals a pointer and bit 0-30
|
||||
* is then the value of the pointer into a byte[] where the termNumber list starts.
|
||||
*
|
||||
* There are actually 256 byte arrays, to compensate for the fact that the pointers
|
||||
* into the byte arrays are only 3 bytes long. The correct byte array for a document
|
||||
* is a function of its id.
|
||||
* A single entry is thus either 0b0xxxxxxxx_xxxxxxxx_xxxxxxxx_xxxxxxxx holding 0-4 vInts
|
||||
* (low byte first) or 0b1xxxxxxxx_xxxxxxxx_xxxxxxxx_xxxxxxxx holding a 31-bit pointer.
|
||||
*
|
||||
* There are 256 byte arrays, as the previous version of DocTermOrds had a pointer limit
|
||||
* of 24 bits / 3 bytes. The correct byte array for a document is a function of its id.
|
||||
*
|
||||
* To save space and speed up faceting, any term that matches enough documents will
|
||||
* not be un-inverted... it will be skipped while building the un-inverted field structure,
|
||||
|
@ -107,7 +109,6 @@ import org.apache.lucene.util.StringHelper;
|
|||
* is stored, along with its corresponding term number, and this is used as an
|
||||
* index to find the closest term and iterate until the desired number is hit (very
|
||||
* much like Lucene's own internal term index).
|
||||
*
|
||||
*/
|
||||
|
||||
public class DocTermOrds implements Accountable {
|
||||
|
@ -152,6 +153,9 @@ public class DocTermOrds implements Accountable {
|
|||
protected long sizeOfIndexedStrings;
|
||||
|
||||
/** Holds the indexed (by default every 128th) terms. */
|
||||
// TODO: This seems like an obvious candidate for using BytesRefArray extended with binarySearch
|
||||
// This would save heap space as well as avoid a lot of small Objects (BytesRefs).
|
||||
// This would also increase data locality for binarySearch lookups, potentially making it faster.
|
||||
protected BytesRef[] indexedTermsArray = new BytesRef[0];
|
||||
|
||||
/** If non-null, only terms matching this prefix were
|
||||
|
@ -170,7 +174,9 @@ public class DocTermOrds implements Accountable {
|
|||
* Normally, docValues should be used in preference to DocTermOrds. */
|
||||
protected boolean checkForDocValues = true;
|
||||
|
||||
// TODO: Why is indexedTermsArray not part of this?
|
||||
/** Returns total bytes used. */
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
// can cache the mem size since it shouldn't change
|
||||
if (memsz!=0) return memsz;
|
||||
|
@ -180,11 +186,15 @@ public class DocTermOrds implements Accountable {
|
|||
for (byte[] arr : tnums)
|
||||
if (arr != null) sz += arr.length;
|
||||
}
|
||||
if (indexedTermsArray != null) {
|
||||
// assume 8 byte references?
|
||||
sz += 8+8+8+8+(indexedTermsArray.length<<3)+sizeOfIndexedStrings;
|
||||
}
|
||||
memsz = sz;
|
||||
return sz;
|
||||
}
|
||||
|
||||
/** Inverts all terms */
|
||||
/** Inverts all terms. */
|
||||
public DocTermOrds(LeafReader reader, Bits liveDocs, String field) throws IOException {
|
||||
this(reader, liveDocs, field, null, Integer.MAX_VALUE);
|
||||
}
|
||||
|
@ -374,10 +384,9 @@ public class DocTermOrds implements Accountable {
|
|||
lastTerm[doc] = termNum;
|
||||
int val = index[doc];
|
||||
|
||||
if ((val & 0xff)==1) {
|
||||
// index into byte array (actually the end of
|
||||
// the doc-specific byte[] when building)
|
||||
int pos = val >>> 8;
|
||||
if ((val & 0x80000000) != 0) {
|
||||
// index into byte array (actually the end of the doc-specific byte[] when building)
|
||||
int pos = val & 0x7fffffff;
|
||||
int ilen = vIntSize(delta);
|
||||
byte[] arr = bytes[doc];
|
||||
int newend = pos+ilen;
|
||||
|
@ -395,7 +404,7 @@ public class DocTermOrds implements Accountable {
|
|||
bytes[doc] = newarr;
|
||||
}
|
||||
pos = writeInt(delta, arr, pos);
|
||||
index[doc] = (pos<<8) | 1; // update pointer to end index in byte[]
|
||||
index[doc] = pos | 0x80000000; // update pointer to end index in byte[]
|
||||
} else {
|
||||
// OK, this int has data in it... find the end (a zero starting byte - not
|
||||
// part of another number, hence not following a byte with the high bit set).
|
||||
|
@ -430,7 +439,7 @@ public class DocTermOrds implements Accountable {
|
|||
val >>>=8;
|
||||
}
|
||||
// point at the end index in the byte[]
|
||||
index[doc] = (endPos<<8) | 1;
|
||||
index[doc] = endPos | 0x80000000;
|
||||
bytes[doc] = tempArr;
|
||||
tempArr = new byte[12];
|
||||
}
|
||||
|
@ -480,14 +489,11 @@ public class DocTermOrds implements Accountable {
|
|||
for (int doc=docbase; doc<lim; doc++) {
|
||||
//System.out.println(" pass=" + pass + " process docID=" + doc);
|
||||
int val = index[doc];
|
||||
if ((val&0xff) == 1) {
|
||||
int len = val >>> 8;
|
||||
if ((val & 0x80000000) != 0) {
|
||||
int len = val & 0x7fffffff;
|
||||
//System.out.println(" ptr pos=" + pos);
|
||||
index[doc] = (pos<<8)|1; // change index to point to start of array
|
||||
if ((pos & 0xff000000) != 0) {
|
||||
// we only have 24 bits for the array index
|
||||
throw new IllegalStateException("Too many values for UnInvertedField faceting on field "+field);
|
||||
}
|
||||
//index[doc] = (pos<<8)|1; // change index to point to start of array
|
||||
index[doc] = pos | 0x80000000; // change index to point to start of array
|
||||
byte[] arr = bytes[doc];
|
||||
/*
|
||||
for(byte b : arr) {
|
||||
|
@ -497,19 +503,15 @@ public class DocTermOrds implements Accountable {
|
|||
bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
|
||||
if (target.length <= pos + len) {
|
||||
int newlen = target.length;
|
||||
/*** we don't have to worry about the array getting too large
|
||||
* since the "pos" param will overflow first (only 24 bits available)
|
||||
if ((newlen<<1) <= 0) {
|
||||
// overflow...
|
||||
newlen = Integer.MAX_VALUE;
|
||||
while (newlen <= pos + len) {
|
||||
if ((newlen<<=1) < 0) { // Double until overflow
|
||||
newlen = Integer.MAX_VALUE - 16; // ArrayList.MAX_ARRAY_SIZE says 8. We double that to be sure
|
||||
if (newlen <= pos + len) {
|
||||
throw new SolrException(400,"Too many terms to uninvert field!");
|
||||
throw new IllegalStateException(
|
||||
"Too many terms (> Integer.MAX_VALUE-16) to uninvert field '" + field + "'");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
while (newlen <= pos + len) newlen<<=1; // doubling strategy
|
||||
}
|
||||
****/
|
||||
while (newlen <= pos + len) newlen<<=1; // doubling strategy
|
||||
byte[] newtarget = new byte[newlen];
|
||||
System.arraycopy(target, 0, newtarget, 0, pos);
|
||||
target = newtarget;
|
||||
|
@ -544,23 +546,24 @@ public class DocTermOrds implements Accountable {
|
|||
|
||||
/** Number of bytes to represent an unsigned int as a vint. */
|
||||
private static int vIntSize(int x) {
|
||||
if ((x & (0xffffffff << (7*1))) == 0 ) {
|
||||
return 1;
|
||||
}
|
||||
if ((x & (0xffffffff << (7*2))) == 0 ) {
|
||||
return 2;
|
||||
}
|
||||
if ((x & (0xffffffff << (7*3))) == 0 ) {
|
||||
return 3;
|
||||
}
|
||||
if ((x & (0xffffffff << (7*4))) == 0 ) {
|
||||
return 4;
|
||||
}
|
||||
return 5;
|
||||
// Tests outside of this code base shows that the previous conditional-based vIntSize is fairly slow until
|
||||
// JITted and still about 1/3 slower after JIT than the numberOfLeadingZeros version below.
|
||||
return BLOCK7[Integer.numberOfLeadingZeros(x)]; // Intrinsic on modern CPUs
|
||||
}
|
||||
private final static byte[] BLOCK7 = new byte[]{
|
||||
5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
|
||||
// todo: if we know the size of the vInt already, we could do
|
||||
// a single switch on the size
|
||||
|
||||
/**
|
||||
* Write the x value as vInt at pos in arr, returning the new endPos. This requires arr to be capable of holding the
|
||||
* bytes needed to represent x. Array length checking should be performed beforehand.
|
||||
* @param x the value to write as vInt.
|
||||
* @param arr the array holding vInt-values.
|
||||
* @param pos the position in arr where the vInt representation of x should be written.
|
||||
* @return the new end position after writing x at pos.
|
||||
*/
|
||||
private static int writeInt(int x, byte[] arr, int pos) {
|
||||
int a;
|
||||
a = (x >>> (7*4));
|
||||
|
@ -830,9 +833,9 @@ public class DocTermOrds implements Accountable {
|
|||
public void setDocument(int docID) {
|
||||
tnum = 0;
|
||||
final int code = index[docID];
|
||||
if ((code & 0xff)==1) {
|
||||
if ((code & 0x80000000) != 0) {
|
||||
// a pointer
|
||||
upto = code>>>8;
|
||||
upto = code & 0x7fffffff;
|
||||
//System.out.println(" pointer! upto=" + upto);
|
||||
int whichArray = (docID >>> 16) & 0xff;
|
||||
arr = tnums[whichArray];
|
||||
|
|
|
@ -136,6 +136,59 @@ public class TestDocTermOrds extends LuceneTestCase {
|
|||
dir.close();
|
||||
}
|
||||
|
||||
/* UnInvertedField had a reference block limitation of 2^24. This unit test triggered it.
|
||||
*
|
||||
* With the current code, the test verifies that the old limit no longer applies.
|
||||
* New limit is 2^31, which is not very realistic to unit-test. */
|
||||
@SuppressWarnings({"ConstantConditions", "PointlessBooleanExpression"})
|
||||
@Nightly
|
||||
public void testTriggerUnInvertLimit() throws IOException {
|
||||
final boolean SHOULD_TRIGGER = false; // Set this to true to use the test with the old implementation
|
||||
|
||||
// Ensure enough terms inside of a single UnInvert-pass-structure to trigger the limit
|
||||
final int REF_LIMIT = (int) Math.pow(2, 24); // Maximum number of references within a single pass-structure
|
||||
final int DOCS = (1<<16)-1; // The number of documents within a single pass (simplified)
|
||||
final int TERMS = REF_LIMIT/DOCS; // Each document must have this many references aka terms hit limit
|
||||
|
||||
Directory dir = newDirectory();
|
||||
final RandomIndexWriter w = new RandomIndexWriter(random(), dir,
|
||||
newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy()));
|
||||
Document doc = new Document();
|
||||
Field field = newTextField("field", "", Field.Store.NO);
|
||||
doc.add(field);
|
||||
|
||||
StringBuilder sb = new StringBuilder(TERMS*(Integer.toString(TERMS).length()+1));
|
||||
for (int i = 0 ; i < TERMS ; i++) {
|
||||
sb.append(" ").append(Integer.toString(i));
|
||||
}
|
||||
field.setStringValue(sb.toString());
|
||||
|
||||
for (int i = 0 ; i < DOCS ; i++) {
|
||||
w.addDocument(doc);
|
||||
}
|
||||
//System.out.println("\n Finished adding " + DOCS + " documents of " + TERMS + " unique terms");
|
||||
final IndexReader r = w.getReader();
|
||||
w.close();
|
||||
|
||||
try {
|
||||
final LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
|
||||
TestUtil.checkReader(ar);
|
||||
final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field"); // bigTerms turned off
|
||||
if (SHOULD_TRIGGER) {
|
||||
fail("DocTermOrds should have failed with a \"Too many values for UnInvertedField\" message");
|
||||
}
|
||||
} catch (IllegalStateException e) {
|
||||
if (!SHOULD_TRIGGER) {
|
||||
fail("DocTermsOrd should not have failed with this implementation, but got exception " +
|
||||
e.getClass().getSimpleName() + " with message " + e.getMessage());
|
||||
}
|
||||
// This is (hopefully) "Too many values for UnInvertedField faceting on field field", so all is as expected
|
||||
} finally {
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
|
||||
|
|
Loading…
Reference in New Issue