From 34634ff4ec0c40bd8c10280ee01d5381d1a69f5c Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 29 Jan 2013 23:56:15 +0000 Subject: [PATCH] fix OOM (allocating too-large int[] in indexer for binary lengths) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1440224 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/index/BinaryDocValuesWriter.java | 64 ++++++++++++++++--- .../lucene/index/Test2BBinaryDocValues.java | 2 - .../BufferingTermFreqIteratorWrapper.java | 1 - .../lucene/search/suggest}/BytesRefArray.java | 11 +++- .../suggest/{fst => }/InMemorySorter.java | 4 +- .../lucene/search/suggest/{fst => }/Sort.java | 2 +- .../SortedTermFreqIteratorWrapper.java | 5 +- .../suggest/analyzing/AnalyzingSuggester.java | 2 +- .../search/suggest/fst/ExternalRefSorter.java | 3 +- .../suggest/fst/FSTCompletionBuilder.java | 1 + .../suggest/fst/FSTCompletionLookup.java | 3 +- .../suggest/fst/WFSTCompletionLookup.java | 2 +- .../search/suggest}/TestBytesRefArray.java | 4 +- .../suggest/fst/BytesRefSortersTest.java | 2 + .../search/suggest/fst/LargeInputFST.java | 1 + .../lucene/search/suggest/fst/TestSort.java | 7 +- 16 files changed, 86 insertions(+), 28 deletions(-) rename lucene/{core/src/java/org/apache/lucene/util => suggest/src/java/org/apache/lucene/search/suggest}/BytesRefArray.java (93%) rename lucene/suggest/src/java/org/apache/lucene/search/suggest/{fst => }/InMemorySorter.java (94%) rename lucene/suggest/src/java/org/apache/lucene/search/suggest/{fst => }/Sort.java (99%) rename lucene/{core/src/test/org/apache/lucene/util => suggest/src/test/org/apache/lucene/search/suggest}/TestBytesRefArray.java (97%) diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java index 6b92b7e5ae3..5d642f53229 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java @@ -23,23 +23,32 @@ import java.io.IOException; import java.util.Iterator; import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.store.RAMFile; +import org.apache.lucene.store.RAMInputStream; +import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefArray; import org.apache.lucene.util.Counter; +import org.apache.lucene.util.packed.AppendingLongBuffer; /** Buffers up pending byte[] per doc, then flushes when * segment flushes. */ class BinaryDocValuesWriter extends DocValuesWriter { - private final BytesRefArray bytesRefArray; + private final RAMFile bytes; + private final RAMOutputStream bytesWriter; + private final AppendingLongBuffer lengths; private final FieldInfo fieldInfo; + private final Counter iwBytesUsed; + private long bytesUsed; private int addedValues = 0; - private final BytesRef emptyBytesRef = new BytesRef(); public BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { this.fieldInfo = fieldInfo; - this.bytesRefArray = new BytesRefArray(iwBytesUsed); + this.bytes = new RAMFile(); + this.bytesWriter = new RAMOutputStream(bytes); + this.iwBytesUsed = iwBytesUsed; + this.lengths = new AppendingLongBuffer(); } public void addValue(int docID, BytesRef value) { @@ -56,19 +65,41 @@ class BinaryDocValuesWriter extends DocValuesWriter { // Fill in any holes: while(addedValues < docID) { addedValues++; - bytesRefArray.append(emptyBytesRef); + lengths.add(0); } addedValues++; - bytesRefArray.append(value); + lengths.add(value.length); + try { + bytesWriter.writeBytes(value.bytes, value.offset, value.length); + } catch (IOException e) { + throw new RuntimeException(e); + } + + updateBytesUsed(); + } + + private void updateBytesUsed() { + // nocommit not totally accurate, but just fix not to use RAMFile anyway + long numBuffers = (bytesWriter.getFilePointer() / 1024) + 1; // round up + long oversize = numBuffers * (1024 + 32); // fudge for arraylist/etc overhead + final long newBytesUsed = lengths.ramBytesUsed() + oversize; + iwBytesUsed.addAndGet(newBytesUsed - bytesUsed); + bytesUsed = newBytesUsed; } @Override public void finish(int maxDoc) { + try { + bytesWriter.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } } @Override public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException { final int maxDoc = state.segmentInfo.getDocCount(); + final int size = addedValues; dvConsumer.addBinaryField(fieldInfo, new Iterable() { @@ -76,8 +107,18 @@ class BinaryDocValuesWriter extends DocValuesWriter { @Override public Iterator iterator() { return new Iterator() { + RAMInputStream bytesReader; + AppendingLongBuffer.Iterator iter = lengths.iterator(); BytesRef value = new BytesRef(); int upto; + + { + try { + bytesReader = new RAMInputStream("bogus", bytes); + } catch (IOException e) { + throw new RuntimeException(e); + } + } @Override public boolean hasNext() { @@ -91,8 +132,15 @@ class BinaryDocValuesWriter extends DocValuesWriter { @Override public BytesRef next() { - if (upto < bytesRefArray.size()) { - bytesRefArray.get(value, upto); + if (upto < size) { + int length = (int) iter.next(); + value.grow(length); + try { + bytesReader.readBytes(value.bytes, 0, length); + } catch (IOException e) { + throw new RuntimeException(e); + } + value.length = length; } else { value.length = 0; } diff --git a/lucene/core/src/test/org/apache/lucene/index/Test2BBinaryDocValues.java b/lucene/core/src/test/org/apache/lucene/index/Test2BBinaryDocValues.java index c8545866122..6ad9c63a437 100644 --- a/lucene/core/src/test/org/apache/lucene/index/Test2BBinaryDocValues.java +++ b/lucene/core/src/test/org/apache/lucene/index/Test2BBinaryDocValues.java @@ -37,7 +37,6 @@ import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; public class Test2BBinaryDocValues extends LuceneTestCase { // indexes Integer.MAX_VALUE docs with a fixed binary field - // nocommit: broken ram accounting? ant test -Dtestcase=Test2BBinaryDocValues -Dtests.method=testFixedBinary -Dtests.seed=5554AA830176B848 -Dtests.slow=true -Dtests.docvaluesformat=Disk -Dtests.locale=sr_RS_#Latn -Dtests.timezone=Africa/Luanda -Dtests.file.encoding=UTF-8 public void testFixedBinary() throws Exception { BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BFixedBinary")); if (dir instanceof MockDirectoryWrapper) { @@ -98,7 +97,6 @@ public class Test2BBinaryDocValues extends LuceneTestCase { } // indexes Integer.MAX_VALUE docs with a variable binary field - // nocommit: broken ram accounting? ant test -Dtestcase=Test2BBinaryDocValues -Dtests.method=testVariableBinary -Dtests.seed=FD50D16920062578 -Dtests.slow=true -Dtests.docvaluesformat=Disk -Dtests.locale=sr_ME_#Latn -Dtests.timezone=America/Argentina/Tucuman -Dtests.file.encoding=UTF-8 public void testVariableBinary() throws Exception { BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BVariableBinary")); if (dir instanceof MockDirectoryWrapper) { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java index f0347cb0c8e..f4eae438778 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java @@ -22,7 +22,6 @@ import java.util.Comparator; import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefArray; import org.apache.lucene.util.Counter; /** diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefArray.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/BytesRefArray.java similarity index 93% rename from lucene/core/src/java/org/apache/lucene/util/BytesRefArray.java rename to lucene/suggest/src/java/org/apache/lucene/search/suggest/BytesRefArray.java index c1a616514d7..15ca9eb4412 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BytesRefArray.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/BytesRefArray.java @@ -1,4 +1,4 @@ -package org.apache.lucene.util; +package org.apache.lucene.search.suggest; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,6 +20,13 @@ package org.apache.lucene.util; import java.util.Arrays; import java.util.Comparator; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.Counter; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.SorterTemplate; /** * A simple append only random-access {@link BytesRef} array that stores full @@ -31,7 +38,7 @@ import java.util.Comparator; * @lucene.internal * @lucene.experimental */ -public final class BytesRefArray { +final class BytesRefArray { private final ByteBlockPool pool; private int[] offsets = new int[1]; private int lastElement = 0; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/InMemorySorter.java similarity index 94% rename from lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java rename to lucene/suggest/src/java/org/apache/lucene/search/suggest/InMemorySorter.java index adeb5398790..0efc3a5fa7a 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/InMemorySorter.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search.suggest.fst; +package org.apache.lucene.search.suggest; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -19,9 +19,9 @@ package org.apache.lucene.search.suggest.fst; import java.util.Comparator; +import org.apache.lucene.search.suggest.fst.BytesRefSorter; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; -import org.apache.lucene.util.BytesRefArray; import org.apache.lucene.util.Counter; /** diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/Sort.java similarity index 99% rename from lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java rename to lucene/suggest/src/java/org/apache/lucene/search/suggest/Sort.java index 463cc993186..8c6c20f1444 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/Sort.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search.suggest.fst; +package org.apache.lucene.search.suggest; /* * Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java index eb758a40c2f..f48305e6fc6 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java @@ -22,9 +22,8 @@ import java.io.IOException; import java.util.Comparator; import org.apache.lucene.search.spell.TermFreqIterator; -import org.apache.lucene.search.suggest.fst.Sort; -import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader; -import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter; +import org.apache.lucene.search.suggest.Sort.ByteSequencesReader; +import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.util.ArrayUtil; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java index 0bbbef90b07..14d881ee8fd 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java @@ -34,7 +34,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStreamToAutomaton; import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.suggest.Lookup; -import org.apache.lucene.search.suggest.fst.Sort; +import org.apache.lucene.search.suggest.Sort; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.DataInput; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java index 56da7b27787..0c464898306 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java @@ -20,7 +20,8 @@ package org.apache.lucene.search.suggest.fst; import java.io.*; import java.util.Comparator; -import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader; +import org.apache.lucene.search.suggest.Sort; +import org.apache.lucene.search.suggest.Sort.ByteSequencesReader; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.IOUtils; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java index e2309879996..8029cfd973e 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java @@ -20,6 +20,7 @@ package org.apache.lucene.search.suggest.fst; import java.io.Closeable; import java.io.IOException; +import org.apache.lucene.search.suggest.InMemorySorter; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.IntsRef; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java index bc92976ad55..a0549e236ed 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java @@ -26,8 +26,9 @@ import java.util.List; import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.Sort; +import org.apache.lucene.search.suggest.Sort.SortInfo; import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion; -import org.apache.lucene.search.suggest.fst.Sort.SortInfo; import org.apache.lucene.search.suggest.tst.TSTLookup; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java index e6d69621e0f..4d74880d768 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java @@ -28,7 +28,7 @@ import java.util.List; import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper; -import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter; +import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.InputStreamDataInput; diff --git a/lucene/core/src/test/org/apache/lucene/util/TestBytesRefArray.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefArray.java similarity index 97% rename from lucene/core/src/test/org/apache/lucene/util/TestBytesRefArray.java rename to lucene/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefArray.java index 7136de1ac8e..90cae6fe218 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestBytesRefArray.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefArray.java @@ -1,4 +1,4 @@ -package org.apache.lucene.util; +package org.apache.lucene.search.suggest; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -22,7 +22,7 @@ import java.util.*; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; -import org.apache.lucene.util.BytesRefArray; +import org.apache.lucene.util.Counter; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java index 02bc7309c79..f8ccd35b55c 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java @@ -17,6 +17,8 @@ package org.apache.lucene.search.suggest.fst; * limitations under the License. */ +import org.apache.lucene.search.suggest.InMemorySorter; +import org.apache.lucene.search.suggest.Sort; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java index a63c3b73310..48a1409d9f9 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java @@ -19,6 +19,7 @@ package org.apache.lucene.search.suggest.fst; import java.io.*; +import org.apache.lucene.search.suggest.Sort; import org.apache.lucene.util.BytesRef; /** diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java index c6adb97163d..6b4c298332e 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java @@ -22,9 +22,10 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; -import org.apache.lucene.search.suggest.fst.Sort.BufferSize; -import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter; -import org.apache.lucene.search.suggest.fst.Sort.SortInfo; +import org.apache.lucene.search.suggest.Sort; +import org.apache.lucene.search.suggest.Sort.BufferSize; +import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter; +import org.apache.lucene.search.suggest.Sort.SortInfo; import org.apache.lucene.util.*; import org.junit.*;