mirror of https://github.com/apache/lucene.git
LUCENE-8380: UTF8TaxonomyWriterCache page/ offset calculation bug
This commit is contained in:
parent
b7b6f242e8
commit
0f652627a0
|
@ -129,6 +129,8 @@ API Changes:
|
|||
|
||||
Bug Fixes:
|
||||
|
||||
* LUCENE-8380: UTF8TaxonomyWriterCache inconsistency. (Ruslan Torobaev, Dawid Weiss)
|
||||
|
||||
* LUCENE-8164: IndexWriter silently accepts broken payload. This has been fixed
|
||||
via LUCENE-8165 since we are now checking for offset+length going out of bounds.
|
||||
(Robert Muir, Nhat Nyugen, Simon Willnauer)
|
||||
|
|
|
@ -37,12 +37,14 @@ public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accou
|
|||
return new BytesRefBuilder();
|
||||
}
|
||||
};
|
||||
|
||||
private final Counter bytesUsed = Counter.newCounter();
|
||||
private final BytesRefHash map = new BytesRefHash(new ByteBlockPool(new DirectTrackingAllocator(bytesUsed)));
|
||||
|
||||
private final static int ORDINALS_PAGE_SIZE = 65536;
|
||||
private final static int ORDINALS_PAGE_MASK = ORDINALS_PAGE_SIZE - 1;
|
||||
|
||||
private final static int PAGE_BITS = 16;
|
||||
private final static int PAGE_SIZE = 1 << PAGE_BITS;
|
||||
private final static int PAGE_MASK = PAGE_SIZE - 1;
|
||||
|
||||
private volatile int[][] ordinals;
|
||||
|
||||
// How many labels we are storing:
|
||||
|
@ -54,7 +56,7 @@ public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accou
|
|||
/** Sole constructor. */
|
||||
public UTF8TaxonomyWriterCache() {
|
||||
ordinals = new int[1][];
|
||||
ordinals[0] = new int[ORDINALS_PAGE_SIZE];
|
||||
ordinals[0] = new int[PAGE_SIZE];
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -67,16 +69,16 @@ public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accou
|
|||
if (id == -1) {
|
||||
return LabelToOrdinal.INVALID_ORDINAL;
|
||||
}
|
||||
int page = id / ORDINALS_PAGE_SIZE;
|
||||
int offset = id % ORDINALS_PAGE_MASK;
|
||||
int page = id >>> PAGE_BITS;
|
||||
int offset = id & PAGE_MASK;
|
||||
return ordinals[page][offset];
|
||||
}
|
||||
|
||||
// Called only from assert
|
||||
private boolean assertSameOrdinal(FacetLabel label, int id, int ord) {
|
||||
id = -id - 1;
|
||||
int page = id / ORDINALS_PAGE_SIZE;
|
||||
int offset = id % ORDINALS_PAGE_MASK;
|
||||
int page = id >>> PAGE_BITS;
|
||||
int offset = id & PAGE_MASK;
|
||||
int oldOrd = ordinals[page][offset];
|
||||
if (oldOrd != ord) {
|
||||
throw new IllegalArgumentException("label " + label + " was already cached, with old ord=" + oldOrd + " versus new ord=" + ord);
|
||||
|
@ -95,15 +97,15 @@ public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accou
|
|||
return false;
|
||||
}
|
||||
assert id == count;
|
||||
int page = id / ORDINALS_PAGE_SIZE;
|
||||
int offset = id % ORDINALS_PAGE_MASK;
|
||||
int page = id >>> PAGE_BITS;
|
||||
int offset = id & PAGE_MASK;
|
||||
if (page == pageCount) {
|
||||
if (page == ordinals.length) {
|
||||
int[][] newOrdinals = new int[ArrayUtil.oversize(page+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][];
|
||||
System.arraycopy(ordinals, 0, newOrdinals, 0, ordinals.length);
|
||||
ordinals = newOrdinals;
|
||||
}
|
||||
ordinals[page] = new int[ORDINALS_PAGE_MASK];
|
||||
ordinals[page] = new int[PAGE_SIZE];
|
||||
pageCount++;
|
||||
}
|
||||
ordinals[page][offset] = ord;
|
||||
|
@ -125,7 +127,7 @@ public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accou
|
|||
map.clear();
|
||||
map.reinit();
|
||||
ordinals = new int[1][];
|
||||
ordinals[0] = new int[ORDINALS_PAGE_SIZE];
|
||||
ordinals[0] = new int[PAGE_SIZE];
|
||||
count = 0;
|
||||
pageCount = 0;
|
||||
assert bytesUsed.get() == 0;
|
||||
|
@ -138,7 +140,7 @@ public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accou
|
|||
|
||||
@Override
|
||||
public synchronized long ramBytesUsed() {
|
||||
return bytesUsed.get() + pageCount * ORDINALS_PAGE_SIZE * RamUsageEstimator.NUM_BYTES_INT;
|
||||
return bytesUsed.get() + pageCount * PAGE_SIZE * Integer.BYTES;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -146,7 +148,7 @@ public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accou
|
|||
}
|
||||
|
||||
private static final byte DELIM_CHAR = (byte) 0x1F;
|
||||
|
||||
|
||||
private BytesRef toBytes(FacetLabel label) {
|
||||
BytesRefBuilder bytes = this.bytes.get();
|
||||
bytes.clear();
|
||||
|
|
|
@ -28,6 +28,16 @@ import org.apache.lucene.facet.taxonomy.FacetLabel;
|
|||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
public class TestUTF8TaxonomyWriterCache extends FacetTestCase {
|
||||
public void testPageOverflow() throws Exception {
|
||||
UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
|
||||
for (int ord = 0; ord < 65536 * 2; ord++) {
|
||||
cache.put(new FacetLabel("foo:", Integer.toString(ord)), ord);
|
||||
}
|
||||
|
||||
for (int ord = 0; ord < 65536 * 2; ord++) {
|
||||
assertEquals(ord, cache.get(new FacetLabel("foo:", Integer.toString(ord))));
|
||||
}
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
LabelToOrdinal map = new LabelToOrdinalMap();
|
||||
|
@ -37,8 +47,6 @@ public class TestUTF8TaxonomyWriterCache extends FacetTestCase {
|
|||
final int n = atLeast(10 * 1000);
|
||||
final int numUniqueValues = 50 * 1000;
|
||||
|
||||
byte[] buffer = new byte[50];
|
||||
|
||||
Random random = random();
|
||||
Set<String> uniqueValuesSet = new HashSet<>();
|
||||
while (uniqueValuesSet.size() < numUniqueValues) {
|
||||
|
|
Loading…
Reference in New Issue