fix OOM (allocating too-large int[] in indexer for binary lengths)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1440224 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-01-29 23:56:15 +00:00
parent d08b259fba
commit 34634ff4ec
16 changed files with 86 additions and 28 deletions

View File

@ -23,23 +23,32 @@ import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.store.RAMFile;
import org.apache.lucene.store.RAMInputStream;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.packed.AppendingLongBuffer;
/** Buffers up pending byte[] per doc, then flushes when
* segment flushes. */
class BinaryDocValuesWriter extends DocValuesWriter {
private final BytesRefArray bytesRefArray;
private final RAMFile bytes;
private final RAMOutputStream bytesWriter;
private final AppendingLongBuffer lengths;
private final FieldInfo fieldInfo;
private final Counter iwBytesUsed;
private long bytesUsed;
private int addedValues = 0;
private final BytesRef emptyBytesRef = new BytesRef();
/**
 * Creates a writer that buffers the binary doc values of a single field.
 *
 * @param fieldInfo the field being written; handed to the consumer at flush
 * @param iwBytesUsed shared counter that receives this writer's RAM-usage deltas
 */
public BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
// NOTE(review): this listing shows diff lines without +/- markers; the
// bytesRefArray assignment is presumably the pre-change storage -- confirm.
this.bytesRefArray = new BytesRefArray(iwBytesUsed);
// Value bytes are appended to an in-memory file through bytesWriter.
this.bytes = new RAMFile();
this.bytesWriter = new RAMOutputStream(bytes);
this.iwBytesUsed = iwBytesUsed;
// One length entry is recorded per added value.
this.lengths = new AppendingLongBuffer();
}
public void addValue(int docID, BytesRef value) {
@ -56,19 +65,41 @@ class BinaryDocValuesWriter extends DocValuesWriter {
// Fill in any holes:
while(addedValues < docID) {
addedValues++;
bytesRefArray.append(emptyBytesRef);
lengths.add(0);
}
addedValues++;
bytesRefArray.append(value);
lengths.add(value.length);
try {
bytesWriter.writeBytes(value.bytes, value.offset, value.length);
} catch (IOException e) {
throw new RuntimeException(e);
}
updateBytesUsed();
}
/**
 * Re-estimates the RAM held by this writer and reports the change since the
 * previous estimate to the shared {@code iwBytesUsed} counter.
 */
private void updateBytesUsed() {
  // nocommit: not totally accurate; the real fix is to stop using RAMFile here anyway
  // Bytes written so far divided into 1024-byte units, plus one to round up.
  final long bufferCount = 1 + (bytesWriter.getFilePointer() / 1024);
  // Each unit is charged 32 extra bytes as a fudge for arraylist/etc overhead.
  final long costPerBuffer = 1024 + 32;
  final long estimate = lengths.ramBytesUsed() + (bufferCount * costPerBuffer);
  // Only the difference from the last reported figure goes to the counter.
  final long delta = estimate - bytesUsed;
  iwBytesUsed.addAndGet(delta);
  bytesUsed = estimate;
}
/**
 * Closes the stream that value bytes were written to. An
 * {@link IOException} raised by the close is rethrown wrapped in a
 * {@link RuntimeException}.
 *
 * @param maxDoc not consulted by this implementation
 */
@Override
public void finish(int maxDoc) {
  try {
    bytesWriter.close();
  } catch (IOException closeFailure) {
    throw new RuntimeException(closeFailure);
  }
}
@Override
public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
final int maxDoc = state.segmentInfo.getDocCount();
final int size = addedValues;
dvConsumer.addBinaryField(fieldInfo,
new Iterable<BytesRef>() {
@ -76,8 +107,18 @@ class BinaryDocValuesWriter extends DocValuesWriter {
@Override
public Iterator<BytesRef> iterator() {
return new Iterator<BytesRef>() {
RAMInputStream bytesReader;
AppendingLongBuffer.Iterator iter = lengths.iterator();
BytesRef value = new BytesRef();
int upto;
{
try {
bytesReader = new RAMInputStream("bogus", bytes);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
public boolean hasNext() {
@ -91,8 +132,15 @@ class BinaryDocValuesWriter extends DocValuesWriter {
@Override
public BytesRef next() {
if (upto < bytesRefArray.size()) {
bytesRefArray.get(value, upto);
if (upto < size) {
int length = (int) iter.next();
value.grow(length);
try {
bytesReader.readBytes(value.bytes, 0, length);
} catch (IOException e) {
throw new RuntimeException(e);
}
value.length = length;
} else {
value.length = 0;
}

View File

@ -37,7 +37,6 @@ import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
public class Test2BBinaryDocValues extends LuceneTestCase {
// indexes Integer.MAX_VALUE docs with a fixed binary field
// nocommit: broken ram accounting? ant test -Dtestcase=Test2BBinaryDocValues -Dtests.method=testFixedBinary -Dtests.seed=5554AA830176B848 -Dtests.slow=true -Dtests.docvaluesformat=Disk -Dtests.locale=sr_RS_#Latn -Dtests.timezone=Africa/Luanda -Dtests.file.encoding=UTF-8
public void testFixedBinary() throws Exception {
BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BFixedBinary"));
if (dir instanceof MockDirectoryWrapper) {
@ -98,7 +97,6 @@ public class Test2BBinaryDocValues extends LuceneTestCase {
}
// indexes Integer.MAX_VALUE docs with a variable binary field
// nocommit: broken ram accounting? ant test -Dtestcase=Test2BBinaryDocValues -Dtests.method=testVariableBinary -Dtests.seed=FD50D16920062578 -Dtests.slow=true -Dtests.docvaluesformat=Disk -Dtests.locale=sr_ME_#Latn -Dtests.timezone=America/Argentina/Tucuman -Dtests.file.encoding=UTF-8
public void testVariableBinary() throws Exception {
BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BVariableBinary"));
if (dir instanceof MockDirectoryWrapper) {

View File

@ -22,7 +22,6 @@ import java.util.Comparator;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.Counter;
/**

View File

@ -1,4 +1,4 @@
package org.apache.lucene.util;
package org.apache.lucene.search.suggest;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -20,6 +20,13 @@ package org.apache.lucene.util;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SorterTemplate;
/**
* A simple append only random-access {@link BytesRef} array that stores full
@ -31,7 +38,7 @@ import java.util.Comparator;
* @lucene.internal
* @lucene.experimental
*/
public final class BytesRefArray {
final class BytesRefArray {
private final ByteBlockPool pool;
private int[] offsets = new int[1];
private int lastElement = 0;

View File

@ -1,4 +1,4 @@
package org.apache.lucene.search.suggest.fst;
package org.apache.lucene.search.suggest;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -19,9 +19,9 @@ package org.apache.lucene.search.suggest.fst;
import java.util.Comparator;
import org.apache.lucene.search.suggest.fst.BytesRefSorter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.Counter;
/**

View File

@ -1,4 +1,4 @@
package org.apache.lucene.search.suggest.fst;
package org.apache.lucene.search.suggest;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more

View File

@ -22,9 +22,8 @@ import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.fst.Sort;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
import org.apache.lucene.search.suggest.Sort.ByteSequencesReader;
import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;

View File

@ -34,7 +34,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.Sort;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;

View File

@ -20,7 +20,8 @@ package org.apache.lucene.search.suggest.fst;
import java.io.*;
import java.util.Comparator;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.search.suggest.Sort.ByteSequencesReader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IOUtils;

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search.suggest.fst;
import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.search.suggest.InMemorySorter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IntsRef;

View File

@ -26,8 +26,9 @@ import java.util.List;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.search.suggest.Sort.SortInfo;
import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
import org.apache.lucene.search.suggest.fst.Sort.SortInfo;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

View File

@ -28,7 +28,7 @@ import java.util.List;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.InputStreamDataInput;

View File

@ -1,4 +1,4 @@
package org.apache.lucene.util;
package org.apache.lucene.search.suggest;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -22,7 +22,7 @@ import java.util.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

View File

@ -17,6 +17,8 @@ package org.apache.lucene.search.suggest.fst;
* limitations under the License.
*/
import org.apache.lucene.search.suggest.InMemorySorter;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.LuceneTestCase;

View File

@ -19,6 +19,7 @@ package org.apache.lucene.search.suggest.fst;
import java.io.*;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.util.BytesRef;
/**

View File

@ -22,9 +22,10 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.search.suggest.fst.Sort.BufferSize;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
import org.apache.lucene.search.suggest.fst.Sort.SortInfo;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.search.suggest.Sort.BufferSize;
import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter;
import org.apache.lucene.search.suggest.Sort.SortInfo;
import org.apache.lucene.util.*;
import org.junit.*;