mirror of https://github.com/apache/lucene.git
OOM prevention with many unique sorted values (still slooooooooooooow)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1440429 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 23eb872045
commit c4d720b6a7
@@ -89,6 +89,12 @@ class SortedDocValuesWriter extends DocValuesWriter {
    int ord = hash.add(value);
    if (ord < 0) {
      ord = -ord-1;
    } else {
      // reserve additional space for each unique value:
      // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
      //    TODO: can this same OOM happen in THPF?
      // 2. when flushing, we need 1 int per value (slot in the ordMap).
      iwBytesUsed.addAndGet(2 * RamUsageEstimator.NUM_BYTES_INT);
    }

    pending.add(ord);
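The hunk above is the OOM fix itself: every time hash.add() reports a new unique term, the writer charges IndexWriter's shared byte counter up front, so a flush can be triggered before the rehash() growth spike and the flush-time ordMap allocation arrive. The following is a minimal standalone sketch of that reservation pattern, not the real writer: it assumes a plain AtomicLong in place of Lucene's shared iwBytesUsed counter and a boolean flag in place of the hash.add() return value.

import java.util.concurrent.atomic.AtomicLong;

// Sketch only: the real code shares IndexWriter's counter and detects a new
// term by hash.add(value) returning a non-negative ord.
class PerUniqueValueAccounting {
  private static final int NUM_BYTES_INT = 4; // stands in for RamUsageEstimator.NUM_BYTES_INT
  private final AtomicLong bytesUsed = new AtomicLong();

  void onTermAdded(boolean isNewTerm) {
    if (isNewTerm) {
      // Mirrors the reservation in the diff above:
      // 1. while indexing, rehash() suddenly needs 2*size ints once the hash is 50% full;
      // 2. at flush, one int per value is needed for its slot in the ordMap.
      bytesUsed.addAndGet(2 * NUM_BYTES_INT);
    }
  }

  long bytesReserved() {
    return bytesUsed.get();
  }
}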
@@ -120,9 +126,7 @@ class SortedDocValuesWriter extends DocValuesWriter {

    final int valueCount = hash.size();

    // nocommit: account for both sortedValues and ordMap as-we-go...
    final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
    final int sortedValueRamUsage = RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + RamUsageEstimator.NUM_BYTES_INT*valueCount;
    final int[] ordMap = new int[valueCount];

    for(int ord=0;ord<valueCount;ord++) {
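At flush time, the hunk above sorts the hash's terms and builds an ordMap of valueCount ints that remaps each term's insertion ordinal to its rank in sorted order; the sortedValueRamUsage line estimates that footprint as one array header plus one int per value. The code below is a hypothetical, self-contained illustration of that ord-to-sorted-ord remapping, with a plain byte[][] and unsigned byte comparison standing in for BytesRefHash and its UTF8-sorted-as-Unicode comparator.

import java.util.Arrays;

// Illustration only: termsByOrd[ord] holds the bytes of the term that was
// assigned ordinal 'ord' when it was first added.
class OrdMapSketch {
  static int[] buildOrdMap(byte[][] termsByOrd) {
    final int valueCount = termsByOrd.length;

    // sortedOrds[i] = insertion ord of the i-th smallest term
    Integer[] sortedOrds = new Integer[valueCount];
    for (int ord = 0; ord < valueCount; ord++) {
      sortedOrds[ord] = ord;
    }
    Arrays.sort(sortedOrds, (a, b) -> Arrays.compareUnsigned(termsByOrd[a], termsByOrd[b]));

    // ordMap[insertionOrd] = rank of that term in sorted order; this is the
    // per-document ordinal written out, costing one int per unique value.
    int[] ordMap = new int[valueCount];
    for (int i = 0; i < valueCount; i++) {
      ordMap[sortedOrds[i]] = i;
    }
    return ordMap;
  }
}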
@@ -93,5 +93,67 @@ public class Test2BSortedDocValues extends LuceneTestCase {
    dir.close();
  }

  // TODO: variable, and also Test2BOrds
  // indexes Integer.MAX_VALUE docs with a fixed binary field
  // nocommit: this must be some kind of worst case for BytesRefHash / its hash fn...
  // or there is some other perf bug...VERY slow!
  public void test2BOrds() throws Exception {
    BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BOrds"));
    if (dir instanceof MockDirectoryWrapper) {
      ((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
    }

    IndexWriter w = new IndexWriter(dir,
        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
            .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
            .setRAMBufferSizeMB(256.0)
            .setMergeScheduler(new ConcurrentMergeScheduler())
            .setMergePolicy(newLogMergePolicy(false, 10))
            .setOpenMode(IndexWriterConfig.OpenMode.CREATE));

    Document doc = new Document();
    byte bytes[] = new byte[4];
    BytesRef data = new BytesRef(bytes);
    SortedDocValuesField dvField = new SortedDocValuesField("dv", data);
    doc.add(dvField);

    for (int i = 0; i < Integer.MAX_VALUE; i++) {
      bytes[0] = (byte)(i >> 24);
      bytes[1] = (byte)(i >> 16);
      bytes[2] = (byte)(i >> 8);
      bytes[3] = (byte) i;
      w.addDocument(doc);
      if (i % 100000 == 0) {
        System.out.println("indexed: " + i);
        System.out.flush();
      }
    }

    w.forceMerge(1);
    w.close();

    System.out.println("verifying...");
    System.out.flush();

    DirectoryReader r = DirectoryReader.open(dir);
    int expectedValue = 0;
    for (AtomicReaderContext context : r.leaves()) {
      AtomicReader reader = context.reader();
      BytesRef scratch = new BytesRef();
      BinaryDocValues dv = reader.getSortedDocValues("dv");
      for (int i = 0; i < reader.maxDoc(); i++) {
        bytes[0] = (byte)(expectedValue >> 24);
        bytes[1] = (byte)(expectedValue >> 16);
        bytes[2] = (byte)(expectedValue >> 8);
        bytes[3] = (byte) expectedValue;
        dv.get(i, scratch);
        assertEquals(data, scratch);
        expectedValue++;
      }
    }

    r.close();
    dir.close();
  }

  // TODO: variable
}
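The new test packs the loop counter into the 4-byte value big-endian, so for the non-negative ints it indexes, unsigned-lexicographic byte order matches numeric order, which is what the sorted doc values field observes; the verification loop rebuilds the expected bytes the same way. Below is a small sketch of that packing and its inverse; the helper names are hypothetical and not part of the test.

// Hypothetical helpers mirroring the inline byte packing in test2BOrds above.
final class IntBytes {
  // Big-endian layout: for values >= 0, unsigned-lexicographic order of the
  // 4 bytes matches numeric order of the ints.
  static byte[] toBytes(int value) {
    return new byte[] {
        (byte) (value >> 24),
        (byte) (value >> 16),
        (byte) (value >> 8),
        (byte) value
    };
  }

  static int fromBytes(byte[] b) {
    return ((b[0] & 0xFF) << 24)
         | ((b[1] & 0xFF) << 16)
         | ((b[2] & 0xFF) << 8)
         |  (b[3] & 0xFF);
  }
}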