LUCENE-7390: revert this change, since it's obsoleted by the much better LUCENE-7396

Mike McCandless 2016-07-29 14:02:00 -04:00
parent 60975d2dfa
commit 1aecdd28d1
16 changed files with 48 additions and 52 deletions

View File

@@ -138,11 +138,6 @@ Improvements
 
 * LUCENE-7385: Improve/fix assert messages in SpanScorer. (David Smiley)
 
-* LUCENE-7390: Improve performance of indexing points by allowing the
-  codec to use transient heap in proportion to IndexWriter's RAM
-  buffer, instead of a fixed 16.0 MB. A custom codec can still
-  override the buffer size itself. (Mike McCandless)
-
 * LUCENE-7393: Add ICUTokenizer option to parse Myanmar text as syllables instead of words,
   because the ICU word-breaking algorithm has some issues. This allows for the previous
   tokenization used before Lucene 5. (AM, Robert Muir)

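For context on the reverted entry above: the LUCENE-7390 heuristic tied the transient sort heap for points to IndexWriter's RAM buffer instead of the fixed 16.0 MB default. A minimal sketch of that sizing rule, using the RAM-buffer/8 ratio visible in the PointValuesWriter hunk further down (the class and method names here are illustrative, not from the commit):

    // Sketch of the reverted heuristic: scale the transient sort heap with
    // IndexWriter's RAM buffer, never dropping below the fixed 16.0 MB
    // default (BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP).
    public class TransientHeapBudget {
      static double transientSortHeapMB(double ramBufferSizeMB) {
        // e.g. a 64 MB buffer gives max(8.0, 16.0) = 16.0 MB,
        // while a 512 MB buffer gives max(64.0, 16.0) = 64.0 MB
        return Math.max(ramBufferSizeMB / 8.0, 16.0);
      }
    }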
View File

@@ -68,7 +68,7 @@ class SimpleTextPointsWriter extends PointsWriter {
   }
 
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
 
     boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@@ -79,7 +79,7 @@ class SimpleTextPointsWriter extends PointsWriter {
                                           fieldInfo.getPointDimensionCount(),
                                           fieldInfo.getPointNumBytes(),
                                           BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
-                                          maxMBSortInHeap,
+                                          BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
                                           values.size(fieldInfo.name),
                                           singleValuePerDoc) {

View File

@@ -22,7 +22,6 @@ import java.io.IOException;
 
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.MergeState;
-import org.apache.lucene.util.bkd.BKDWriter;
 
 /** Abstract API to write points
  *
@@ -35,9 +34,8 @@ public abstract class PointsWriter implements Closeable {
   protected PointsWriter() {
   }
 
-  /** Write all values contained in the provided reader. {@code maxMBSortInHeap} is the maximum
-   *  transient heap that can be used to sort values, before spilling to disk for offline sorting */
-  public abstract void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException;
+  /** Write all values contained in the provided reader */
+  public abstract void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException;
 
   /** Default naive merge implementation for one field: it just re-indexes all the values
    *  from the incoming segment. The default codec overrides this for 1D fields and uses
@@ -147,10 +145,7 @@ public abstract class PointsWriter implements Closeable {
           public int getDocCount(String fieldName) {
             return finalDocCount;
           }
-        },
-        // TODO: also let merging of > 1D fields tap into IW's indexing buffer size, somehow (1D fields do an optimized merge sort
-        // and don't need heap)
-        BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
+        });
   }
 
   /** Default merge implementation to merge incoming points readers by visiting all their points and

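The writeField signature above is the core of this revert: codecs no longer receive a per-call maxMBSortInHeap budget. A minimal delegating writer against the restored two-argument API might look like the following sketch (FilterPointsWriter is hypothetical; it assumes the remaining abstract methods of PointsWriter in this 6.x API are finish() and close()):

    import java.io.IOException;
    import org.apache.lucene.codecs.PointsReader;
    import org.apache.lucene.codecs.PointsWriter;
    import org.apache.lucene.index.FieldInfo;

    // Hypothetical pass-through writer showing the restored two-argument API.
    class FilterPointsWriter extends PointsWriter {
      private final PointsWriter in;

      FilterPointsWriter(PointsWriter in) {
        this.in = in;
      }

      @Override
      public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
        // No heap-budget parameter anymore; the codec owns that setting.
        in.writeField(fieldInfo, values);
      }

      @Override
      public void finish() throws IOException {
        in.finish(); // called once after all fields are written
      }

      @Override
      public void close() throws IOException {
        in.close();
      }
    }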
View File

@@ -40,9 +40,7 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.bkd.BKDReader;
 import org.apache.lucene.util.bkd.BKDWriter;
 
-/** Writes dimensional values
- *
- * @lucene.experimental */
+/** Writes dimensional values */
 public class Lucene60PointsWriter extends PointsWriter implements Closeable {
 
   /** Output used to write the BKD tree data file */
@@ -53,13 +51,15 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
   final SegmentWriteState writeState;
   final int maxPointsInLeafNode;
+  final double maxMBSortInHeap;
   private boolean finished;
 
   /** Full constructor */
-  public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode) throws IOException {
+  public Lucene60PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException {
     assert writeState.fieldInfos.hasPointValues();
     this.writeState = writeState;
     this.maxPointsInLeafNode = maxPointsInLeafNode;
+    this.maxMBSortInHeap = maxMBSortInHeap;
     String dataFileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name,
                                                          writeState.segmentSuffix,
                                                          Lucene60PointsFormat.DATA_EXTENSION);
@@ -81,11 +81,11 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
 
   /** Uses the defaults values for {@code maxPointsInLeafNode} (1024) and {@code maxMBSortInHeap} (16.0) */
   public Lucene60PointsWriter(SegmentWriteState writeState) throws IOException {
-    this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+    this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
   }
 
   @Override
-  public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
+  public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
 
     boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@@ -182,8 +182,7 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
                                           fieldInfo.getPointDimensionCount(),
                                           fieldInfo.getPointNumBytes(),
                                           maxPointsInLeafNode,
-                                          // NOTE: not used, since BKDWriter.merge does a merge sort:
-                                          BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
+                                          maxMBSortInHeap,
                                           totMaxSize,
                                           singleValuePerDoc)) {
       List<BKDReader> bkdReaders = new ArrayList<>();

View File

@@ -153,7 +153,7 @@ class DocumentsWriterPerThread {
   final Allocator byteBlockAllocator;
   final IntBlockPool.Allocator intBlockAllocator;
   private final AtomicLong pendingNumDocs;
-  final LiveIndexWriterConfig indexWriterConfig;
+  private final LiveIndexWriterConfig indexWriterConfig;
   private final boolean enableTestPoints;
   private final IndexWriter indexWriter;

View File

@@ -762,7 +762,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
    * {@link #getConfig()}.
    *
    * <p>
-   * <b>NOTE:</b> after this writer is created, the given configuration instance
+   * <b>NOTE:</b> after ths writer is created, the given configuration instance
    * cannot be passed to another writer.
    *
    * @param d

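The NOTE restored above encodes a real constraint: an IndexWriterConfig instance is bound to the first writer created from it. A short sketch of the intended usage (paths and analyzer are illustrative):

    import java.nio.file.Paths;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.FSDirectory;

    public class ConfigPerWriter {
      public static void main(String[] args) throws Exception {
        // cfg is bound to w1 once w1 is created; it cannot be reused.
        IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
        IndexWriter w1 = new IndexWriter(FSDirectory.open(Paths.get("/tmp/idx1")), cfg);

        // Wrong: new IndexWriter(..., cfg) again. Right: build a fresh config.
        IndexWriter w2 = new IndexWriter(FSDirectory.open(Paths.get("/tmp/idx2")),
                                         new IndexWriterConfig(new StandardAnalyzer()));
        w1.close();
        w2.close();
      }
    }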
View File

@@ -168,14 +168,9 @@ public class LiveIndexWriterConfig {
   /**
    * Determines the amount of RAM that may be used for buffering added documents
-   * and deletions before beginning to flush them to the Directory. For
-   * faster indexing performance it's best to use as large a RAM buffer as you can.
-   * <p>
-   * Note that this setting is not a hard limit on memory usage during indexing, as
-   * transient and non-trivial memory well beyond this buffer size may be used,
-   * for example due to segment merges or writing points to new segments.
-   * For application stability the available memory in the JVM
-   * should be significantly larger than the RAM buffer used for indexing.
+   * and deletions before they are flushed to the Directory. Generally for
+   * faster indexing performance it's best to flush by RAM usage instead of
+   * document count and use as large a RAM buffer as you can.
    * <p>
    * When this is set, the writer will flush whenever buffered documents and
    * deletions use this much RAM. Pass in
@@ -183,6 +178,14 @@ public class LiveIndexWriterConfig {
    * due to RAM usage. Note that if flushing by document count is also enabled,
    * then the flush will be triggered by whichever comes first.
    * <p>
+   * The maximum RAM limit is inherently determined by the JVMs available
+   * memory. Yet, an {@link IndexWriter} session can consume a significantly
+   * larger amount of memory than the given RAM limit since this limit is just
+   * an indicator when to flush memory resident documents to the Directory.
+   * Flushes are likely happen concurrently while other threads adding documents
+   * to the writer. For application stability the available memory in the JVM
+   * should be significantly larger than the RAM buffer used for indexing.
+   * <p>
    * <b>NOTE</b>: the account of RAM usage for pending deletions is only
    * approximate. Specifically, if you delete by Query, Lucene currently has no
    * way to measure the RAM usage of individual Queries so the accounting will

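As a usage sketch for the javadoc above, flushing by RAM usage rather than document count (the 256 MB figure is illustrative only):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriterConfig;

    public class FlushByRam {
      public static IndexWriterConfig newConfig() {
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        iwc.setRAMBufferSizeMB(256.0); // flush when buffered docs + deletes reach ~256 MB
        iwc.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH); // don't also flush by doc count
        return iwc;
      }
    }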
View File

@@ -25,7 +25,6 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.bkd.BKDWriter;
 
 /** Buffers up pending byte[][] value(s) per doc, then flushes when segment flushes. */
 class PointValuesWriter {
@@ -37,7 +36,6 @@ class PointValuesWriter {
   private int numDocs;
   private int lastDocID = -1;
   private final int packedBytesLength;
-  private final LiveIndexWriterConfig indexWriterConfig;
 
   public PointValuesWriter(DocumentsWriterPerThread docWriter, FieldInfo fieldInfo) {
     this.fieldInfo = fieldInfo;
@@ -46,7 +44,6 @@ class PointValuesWriter {
     docIDs = new int[16];
     iwBytesUsed.addAndGet(16 * Integer.BYTES);
     packedBytesLength = fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes();
-    indexWriterConfig = docWriter.indexWriterConfig;
   }
 
   // TODO: if exactly the same value is added to exactly the same doc, should we dedup?
@@ -167,6 +164,6 @@ class PointValuesWriter {
       }
     };
 
-    writer.writeField(fieldInfo, reader, Math.max(indexWriterConfig.getRAMBufferSizeMB()/8.0, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP));
+    writer.writeField(fieldInfo, reader);
   }
 }

View File

@@ -205,7 +205,7 @@ public class BKDWriter implements Closeable {
     // all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X
     // what that level would consume, so we multiply by 0.5 to convert from
     // bytes to points here. Each dimension has its own sorted partition, so
-    // we must divide by numDims as well.
+    // we must divide by numDims as wel.
     maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));

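Plugging sample numbers into the line above makes the 0.5 factor and the numDims division concrete (bytesPerDoc and numDims are illustrative values, not taken from this commit):

    // Illustrative only: worked example of the maxPointsSortInHeap formula.
    public class BkdHeapMath {
      public static void main(String[] args) {
        double maxMBSortInHeap = 16.0; // the fixed default this commit restores
        int bytesPerDoc = 20;          // hypothetical packed point value plus docID
        int numDims = 2;               // e.g. 2D lat/lon points
        int maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
        System.out.println(maxPointsSortInHeap); // 0.5 * 16,777,216 / 40 = 209715
      }
    }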
View File

@@ -41,8 +41,9 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
     if (random().nextBoolean()) {
       // randomize parameters
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
+      double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
       }
 
       // sneaky impersonation!
@@ -52,7 +53,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
+          return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
         }
 
         @Override

View File

@@ -1156,8 +1156,9 @@ public class TestPointQueries extends LuceneTestCase {
   private static Codec getCodec() {
     if (Codec.getDefault().getName().equals("Lucene62")) {
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
+      double maxMBSortInHeap = 5.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
       }
 
       return new FilterCodec("Lucene62", Codec.getDefault()) {
@@ -1166,7 +1167,7 @@ public class TestPointQueries extends LuceneTestCase {
         return new PointsFormat() {
           @Override
           public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-            return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
+            return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
           }
 
           @Override

View File

@@ -87,8 +87,9 @@ public class TestGeo3DPoint extends LuceneTestCase {
   private static Codec getCodec() {
     if (Codec.getDefault().getName().equals("Lucene62")) {
       int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048);
+      double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
       if (VERBOSE) {
-        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode);
+        System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
       }
 
       return new FilterCodec("Lucene62", Codec.getDefault()) {
@@ -97,7 +98,7 @@ public class TestGeo3DPoint extends LuceneTestCase {
         return new PointsFormat() {
          @Override
          public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-            return new Lucene60PointsWriter(writeState, maxPointsInLeafNode);
+            return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
          }
 
          @Override

View File

@@ -255,11 +255,11 @@ public final class AssertingPointsFormat extends PointsFormat {
     }
 
     @Override
-    public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
+    public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
       if (fieldInfo.getPointDimensionCount() == 0) {
         throw new IllegalArgumentException("writing field=\"" + fieldInfo.name + "\" but pointDimensionalCount is 0");
       }
-      in.writeField(fieldInfo, values, maxMBSortInHeap);
+      in.writeField(fieldInfo, values);
     }
 
     @Override

View File

@@ -56,11 +56,11 @@ class CrankyPointsFormat extends PointsFormat {
     }
 
     @Override
-    public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
+    public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
       if (random.nextInt(100) == 0) {
        throw new IOException("Fake IOException");
       }
-      delegate.writeField(fieldInfo, values, maxMBSortInHeap);
+      delegate.writeField(fieldInfo, values);
     }
 
     @Override

View File

@@ -67,6 +67,7 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.SloppyMath;
 import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.bkd.BKDWriter;
 
 /**
  * Abstract class to do basic tests for a geospatial impl (high level
@@ -1247,7 +1248,7 @@ public abstract class BaseGeoPointTestCase extends LuceneTestCase {
       return new PointsFormat() {
         @Override
         public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-          return new Lucene60PointsWriter(writeState, pointsInLeaf);
+          return new Lucene60PointsWriter(writeState, pointsInLeaf, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
         }
 
         @Override

View File

@@ -92,6 +92,7 @@ public class RandomCodec extends AssertingCodec {
   // which is less effective for testing.
   // TODO: improve how we randomize this...
   private final int maxPointsInLeafNode;
+  private final double maxMBSortInHeap;
   private final int bkdSplitRandomSeed;
 
   @Override
@@ -102,9 +103,9 @@ public class RandomCodec extends AssertingCodec {
       // Randomize how BKDWriter chooses its splis:
-      return new Lucene60PointsWriter(writeState, maxPointsInLeafNode) {
+      return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap) {
         @Override
-        public void writeField(FieldInfo fieldInfo, PointsReader values, double maxMBSortInHeap) throws IOException {
+        public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
 
           boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
@@ -184,6 +185,7 @@ public class RandomCodec extends AssertingCodec {
     int lowFreqCutoff = TestUtil.nextInt(random, 2, 100);
 
     maxPointsInLeafNode = TestUtil.nextInt(random, 16, 2048);
+    maxMBSortInHeap = 5.0 + (3*random.nextDouble());
     bkdSplitRandomSeed = random.nextInt();
 
     add(avoidCodecs,
@@ -251,7 +253,8 @@ public class RandomCodec extends AssertingCodec {
   public String toString() {
     return super.toString() + ": " + previousMappings.toString() +
            ", docValues:" + previousDVMappings.toString() +
-           ", maxPointsInLeafNode=" + maxPointsInLeafNode;
+           ", maxPointsInLeafNode=" + maxPointsInLeafNode +
+           ", maxMBSortInHeap=" + maxMBSortInHeap;
   }
 
   /** Just like {@link BKDWriter} except it evilly picks random ways to split cells on