From c4d720b6a73e3aaec235401fe921d6a4597c061e Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 30 Jan 2013 14:21:22 +0000 Subject: [PATCH] OOM prevention with many unique sorted values (still slooooooooooooow) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1440429 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/index/SortedDocValuesWriter.java | 8 ++- .../lucene/index/Test2BSortedDocValues.java | 64 ++++++++++++++++++- 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java index 3dfbaf2051f..b90eb7fa6ca 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java @@ -89,6 +89,12 @@ class SortedDocValuesWriter extends DocValuesWriter { int ord = hash.add(value); if (ord < 0) { ord = -ord-1; + } else { + // reserve additional space for each unique value: + // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints. + // TODO: can this same OOM happen in THPF? + // 2. when flushing, we need 1 int per value (slot in the ordMap). + iwBytesUsed.addAndGet(2 * RamUsageEstimator.NUM_BYTES_INT); } pending.add(ord); @@ -120,9 +126,7 @@ class SortedDocValuesWriter extends DocValuesWriter { final int valueCount = hash.size(); - // nocommit: account for both sortedValues and ordMap as-we-go... final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); - final int sortedValueRamUsage = RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + RamUsageEstimator.NUM_BYTES_INT*valueCount; final int[] ordMap = new int[valueCount]; for(int ord=0;ord> 24); + bytes[1] = (byte)(i >> 16); + bytes[2] = (byte)(i >> 8); + bytes[3] = (byte) i; + w.addDocument(doc); + if (i % 100000 == 0) { + System.out.println("indexed: " + i); + System.out.flush(); + } + } + + w.forceMerge(1); + w.close(); + + System.out.println("verifying..."); + System.out.flush(); + + DirectoryReader r = DirectoryReader.open(dir); + int expectedValue = 0; + for (AtomicReaderContext context : r.leaves()) { + AtomicReader reader = context.reader(); + BytesRef scratch = new BytesRef(); + BinaryDocValues dv = reader.getSortedDocValues("dv"); + for (int i = 0; i < reader.maxDoc(); i++) { + bytes[0] = (byte)(expectedValue >> 24); + bytes[1] = (byte)(expectedValue >> 16); + bytes[2] = (byte)(expectedValue >> 8); + bytes[3] = (byte) expectedValue; + dv.get(i, scratch); + assertEquals(data, scratch); + expectedValue++; + } + } + + r.close(); + dir.close(); + } + + // TODO: variable }