OOM prevention with many unique sorted values (still slooooooooooooow)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1440429 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-01-30 14:21:22 +00:00
parent 23eb872045
commit c4d720b6a7
2 changed files with 69 additions and 3 deletions

View File

@ -89,6 +89,12 @@ class SortedDocValuesWriter extends DocValuesWriter {
int ord = hash.add(value);
if (ord < 0) {
ord = -ord-1;
} else {
// reserve additional space for each unique value:
// 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
// TODO: can this same OOM happen in THPF?
// 2. when flushing, we need 1 int per value (slot in the ordMap).
iwBytesUsed.addAndGet(2 * RamUsageEstimator.NUM_BYTES_INT);
}
pending.add(ord);
@ -120,9 +126,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
final int valueCount = hash.size();
// nocommit: account for both sortedValues and ordMap as-we-go...
final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
final int sortedValueRamUsage = RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + RamUsageEstimator.NUM_BYTES_INT*valueCount;
final int[] ordMap = new int[valueCount];
for(int ord=0;ord<valueCount;ord++) {

View File

@ -93,5 +93,67 @@ public class Test2BSortedDocValues extends LuceneTestCase {
dir.close();
}
// TODO: variable, and also Test2BOrds
// indexes Integer.MAX_VALUE docs with a fixed binary field
// nocommit: this must be some kind of worst case for BytesRefHash / its hash fn...
// or there is some other perf bug...VERY slow!
public void test2BOrds() throws Exception {
BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BOrds"));
if (dir instanceof MockDirectoryWrapper) {
((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
}
IndexWriter w = new IndexWriter(dir,
new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
.setRAMBufferSizeMB(256.0)
.setMergeScheduler(new ConcurrentMergeScheduler())
.setMergePolicy(newLogMergePolicy(false, 10))
.setOpenMode(IndexWriterConfig.OpenMode.CREATE));
Document doc = new Document();
byte bytes[] = new byte[4];
BytesRef data = new BytesRef(bytes);
SortedDocValuesField dvField = new SortedDocValuesField("dv", data);
doc.add(dvField);
for (int i = 0; i < Integer.MAX_VALUE; i++) {
bytes[0] = (byte)(i >> 24);
bytes[1] = (byte)(i >> 16);
bytes[2] = (byte)(i >> 8);
bytes[3] = (byte) i;
w.addDocument(doc);
if (i % 100000 == 0) {
System.out.println("indexed: " + i);
System.out.flush();
}
}
w.forceMerge(1);
w.close();
System.out.println("verifying...");
System.out.flush();
DirectoryReader r = DirectoryReader.open(dir);
int expectedValue = 0;
for (AtomicReaderContext context : r.leaves()) {
AtomicReader reader = context.reader();
BytesRef scratch = new BytesRef();
BinaryDocValues dv = reader.getSortedDocValues("dv");
for (int i = 0; i < reader.maxDoc(); i++) {
bytes[0] = (byte)(expectedValue >> 24);
bytes[1] = (byte)(expectedValue >> 16);
bytes[2] = (byte)(expectedValue >> 8);
bytes[3] = (byte) expectedValue;
dv.get(i, scratch);
assertEquals(data, scratch);
expectedValue++;
}
}
r.close();
dir.close();
}
// TODO: variable
}