LUCENE-2357: used packed ints to hold docID remapping during merging, to reduce transient RAM usage

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1343946 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-05-29 19:39:36 +00:00
parent f5223505b0
commit 5d3dba2c56
9 changed files with 228 additions and 55 deletions

View File

@ -937,6 +937,9 @@ Optimizations
performance; add float acceptableOverheadRatio to getWriter and
getMutable API to give packed ints freedom to pick faster
implementations (Adrien Grand via Mike McCandless)
* LUCENE-2357: Reduce transient RAM usage when merging segments in
IndexWriter. (Adrien Grand via Mike McCandless)
Bug fixes

View File

@ -36,7 +36,7 @@ public final class MappingMultiDocsAndPositionsEnum extends DocsAndPositionsEnum
private MultiDocsAndPositionsEnum.EnumWithSlice[] subs;
int numSubs;
int upto;
int[] currentMap;
MergeState.DocMap currentMap;
DocsAndPositionsEnum current;
int currentBase;
int doc = -1;
@ -94,12 +94,10 @@ public final class MappingMultiDocsAndPositionsEnum extends DocsAndPositionsEnum
int doc = current.nextDoc();
if (doc != NO_MORE_DOCS) {
if (currentMap != null) {
// compact deletions
doc = currentMap[doc];
if (doc == -1) {
continue;
}
// compact deletions
doc = currentMap.get(doc);
if (doc == -1) {
continue;
}
return this.doc = currentBase + doc;
} else {

View File

@ -35,7 +35,7 @@ public final class MappingMultiDocsEnum extends DocsEnum {
private MultiDocsEnum.EnumWithSlice[] subs;
int numSubs;
int upto;
int[] currentMap;
MergeState.DocMap currentMap;
DocsEnum current;
int currentBase;
int doc = -1;
@ -88,18 +88,16 @@ public final class MappingMultiDocsEnum extends DocsEnum {
current = subs[upto].docsEnum;
currentBase = mergeState.docBase[reader];
currentMap = mergeState.docMaps[reader];
assert currentMap == null || currentMap.length == subs[upto].slice.length: "readerIndex=" + reader + " subs.len=" + subs.length + " len1=" + currentMap.length + " vs " + subs[upto].slice.length;
assert currentMap.maxDoc() == subs[upto].slice.length: "readerIndex=" + reader + " subs.len=" + subs.length + " len1=" + currentMap.maxDoc() + " vs " + subs[upto].slice.length;
}
}
int doc = current.nextDoc();
if (doc != NO_MORE_DOCS) {
if (currentMap != null) {
// compact deletions
doc = currentMap[doc];
if (doc == -1) {
continue;
}
// compact deletions
doc = currentMap.get(doc);
if (doc == -1) {
continue;
}
return this.doc = currentBase + doc;
} else {

View File

@ -3483,7 +3483,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
merge.readers.add(reader);
assert delCount <= info.info.getDocCount(): "delCount=" + delCount + " info.docCount=" + info.info.getDocCount() + " rld.pendingDeleteCount=" + rld.getPendingDeleteCount() + " info.getDelCount()=" + info.getDelCount();
if (delCount < info.info.getDocCount()) {
merger.add(reader, liveDocs);
merger.add(reader, liveDocs, delCount);
}
segUpto++;
}

View File

@ -19,11 +19,12 @@ package org.apache.lucene.index;
import java.util.List;
import org.apache.lucene.index.PayloadProcessorProvider.ReaderPayloadProcessor;
import org.apache.lucene.index.PayloadProcessorProvider.PayloadProcessor;
import org.apache.lucene.index.PayloadProcessorProvider.ReaderPayloadProcessor;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.packed.PackedInts;
/** Holds common state used during segment merging
*
@ -33,17 +34,171 @@ public class MergeState {
public static class IndexReaderAndLiveDocs {
public final AtomicReader reader;
public final Bits liveDocs;
public final int numDeletedDocs;
public IndexReaderAndLiveDocs(AtomicReader reader, Bits liveDocs) {
public IndexReaderAndLiveDocs(AtomicReader reader, Bits liveDocs, int numDeletedDocs) {
this.reader = reader;
this.liveDocs = liveDocs;
this.numDeletedDocs = numDeletedDocs;
}
}
public static abstract class DocMap {
private final Bits liveDocs;
protected DocMap(Bits liveDocs) {
this.liveDocs = liveDocs;
}
public static DocMap build(IndexReaderAndLiveDocs reader) {
final int maxDoc = reader.reader.maxDoc();
final int numDeletes = reader.numDeletedDocs;
final int numDocs = maxDoc - numDeletes;
assert reader.liveDocs != null || numDeletes == 0;
if (numDeletes == 0) {
return new NoDelDocMap(maxDoc);
} else if (numDeletes < numDocs) {
return buildDelCountDocmap(maxDoc, numDeletes, reader.liveDocs, PackedInts.FAST);
} else {
return buildDirectDocMap(maxDoc, numDocs, reader.liveDocs, PackedInts.FAST);
}
}
static DocMap buildDelCountDocmap(int maxDoc, int numDeletes, Bits liveDocs, float acceptableOverheadRatio) {
PackedInts.Mutable numDeletesSoFar = PackedInts.getMutable(maxDoc,
PackedInts.bitsRequired(numDeletes), acceptableOverheadRatio);
int del = 0;
for (int i = 0; i < maxDoc; ++i) {
if (!liveDocs.get(i)) {
++del;
}
numDeletesSoFar.set(i, del);
}
assert del == numDeletes : "del=" + del + ", numdeletes=" + numDeletes;
return new DelCountDocMap(liveDocs, numDeletesSoFar);
}
static DocMap buildDirectDocMap(int maxDoc, int numDocs, Bits liveDocs, float acceptableOverheadRatio) {
PackedInts.Mutable docIds = PackedInts.getMutable(maxDoc,
PackedInts.bitsRequired(Math.max(0, numDocs - 1)), acceptableOverheadRatio);
int del = 0;
for (int i = 0; i < maxDoc; ++i) {
if (liveDocs.get(i)) {
docIds.set(i, i - del);
} else {
++del;
}
}
assert numDocs + del == maxDoc : "maxDoc=" + maxDoc + ", del=" + del + ", numDocs=" + numDocs;
return new DirectDocMap(liveDocs, docIds, del);
}
public int get(int docId) {
if (liveDocs == null || liveDocs.get(docId)) {
return remap(docId);
} else {
return -1;
}
}
public abstract int remap(int docId);
public abstract int maxDoc();
public final int numDocs() {
return maxDoc() - numDeletedDocs();
}
public abstract int numDeletedDocs();
public boolean hasDeletions() {
return numDeletedDocs() > 0;
}
}
private static class NoDelDocMap extends DocMap {
private final int maxDoc;
private NoDelDocMap(int maxDoc) {
super(null);
this.maxDoc = maxDoc;
}
@Override
public int remap(int docId) {
return docId;
}
@Override
public int maxDoc() {
return maxDoc;
}
@Override
public int numDeletedDocs() {
return 0;
}
}
private static class DirectDocMap extends DocMap {
private final PackedInts.Mutable docIds;
private final int numDeletedDocs;
private DirectDocMap(Bits liveDocs, PackedInts.Mutable docIds, int numDeletedDocs) {
super(liveDocs);
this.docIds = docIds;
this.numDeletedDocs = numDeletedDocs;
}
@Override
public int remap(int docId) {
return (int) docIds.get(docId);
}
@Override
public int maxDoc() {
return docIds.size();
}
@Override
public int numDeletedDocs() {
return numDeletedDocs;
}
}
private static class DelCountDocMap extends DocMap {
private final PackedInts.Mutable numDeletesSoFar;
private DelCountDocMap(Bits liveDocs, PackedInts.Mutable numDeletesSoFar) {
super(liveDocs);
this.numDeletesSoFar = numDeletesSoFar;
}
@Override
public int remap(int docId) {
return docId - (int) numDeletesSoFar.get(docId);
}
@Override
public int maxDoc() {
return numDeletesSoFar.size();
}
@Override
public int numDeletedDocs() {
final int maxDoc = maxDoc();
return (int) numDeletesSoFar.get(maxDoc - 1);
}
}
public SegmentInfo segmentInfo;
public FieldInfos fieldInfos;
public List<IndexReaderAndLiveDocs> readers; // Readers & liveDocs being merged
public int[][] docMaps; // Maps docIDs around deletions
public DocMap[] docMaps; // Maps docIDs around deletions
public int[] docBase; // New docID base per reader
public CheckAbort checkAbort;
public InfoStream infoStream;
@ -65,8 +220,8 @@ public class MergeState {
public static class CheckAbort {
private double workCount;
private MergePolicy.OneMerge merge;
private Directory dir;
private final MergePolicy.OneMerge merge;
private final Directory dir;
public CheckAbort(MergePolicy.OneMerge merge, Directory dir) {
this.merge = merge;
this.dir = dir;

View File

@ -322,7 +322,7 @@ public class MultiDocValues extends DocValues {
final MergeContext ctx = SortedBytesMergeUtils.init(type, values,
comp, globalNumDocs);
List<SortedSourceSlice> slices = SortedBytesMergeUtils.buildSlices(
docBases(), new int[values.length][], values, ctx);
docBases(), new MergeState.DocMap[values.length], values, ctx);
RecordingBytesRefConsumer consumer = new RecordingBytesRefConsumer(
type);
final int maxOrd = SortedBytesMergeUtils.mergeRecords(ctx, consumer,

View File

@ -80,7 +80,7 @@ final class SegmentMerger {
new ReaderUtil.Gather(reader) {
@Override
protected void add(int base, AtomicReader r) {
mergeState.readers.add(new MergeState.IndexReaderAndLiveDocs(r, r.getLiveDocs()));
mergeState.readers.add(new MergeState.IndexReaderAndLiveDocs(r, r.getLiveDocs(), r.numDeletedDocs()));
}
}.run();
} catch (IOException ioe) {
@ -89,8 +89,8 @@ final class SegmentMerger {
}
}
final void add(SegmentReader reader, Bits liveDocs) {
mergeState.readers.add(new MergeState.IndexReaderAndLiveDocs(reader, liveDocs));
final void add(SegmentReader reader, Bits liveDocs, int delCount) {
mergeState.readers.add(new MergeState.IndexReaderAndLiveDocs(reader, liveDocs, delCount));
}
/**
@ -287,7 +287,7 @@ final class SegmentMerger {
final int numReaders = mergeState.readers.size();
// Remap docIDs
mergeState.docMaps = new int[numReaders][];
mergeState.docMaps = new MergeState.DocMap[numReaders];
mergeState.docBase = new int[numReaders];
mergeState.readerPayloadProcessor = new PayloadProcessorProvider.ReaderPayloadProcessor[numReaders];
mergeState.currentPayloadProcessor = new PayloadProcessorProvider.PayloadProcessor[numReaders];
@ -300,30 +300,9 @@ final class SegmentMerger {
final MergeState.IndexReaderAndLiveDocs reader = mergeState.readers.get(i);
mergeState.docBase[i] = docBase;
final int maxDoc = reader.reader.maxDoc();
final int docCount;
final Bits liveDocs = reader.liveDocs;
final int[] docMap;
if (liveDocs != null) {
int delCount = 0;
docMap = new int[maxDoc];
int newDocID = 0;
for(int j=0;j<maxDoc;j++) {
if (!liveDocs.get(j)) {
docMap[j] = -1;
delCount++;
} else {
docMap[j] = newDocID++;
}
}
docCount = maxDoc - delCount;
} else {
docCount = maxDoc;
docMap = null;
}
final MergeState.DocMap docMap = MergeState.DocMap.build(reader);
mergeState.docMaps[i] = docMap;
docBase += docCount;
docBase += docMap.numDocs();
if (mergeState.payloadProcessorProvider != null) {
mergeState.readerPayloadProcessor[i] = mergeState.payloadProcessorProvider.getReaderProcessor(reader.reader);

View File

@ -81,7 +81,8 @@ public final class SortedBytesMergeUtils {
}
}
public static List<SortedSourceSlice> buildSlices(int[] docBases, int[][] docMaps,
public static List<SortedSourceSlice> buildSlices(
int[] docBases, MergeState.DocMap[] docMaps,
DocValues[] docValues, MergeContext ctx) throws IOException {
final List<SortedSourceSlice> slices = new ArrayList<SortedSourceSlice>();
for (int i = 0; i < docValues.length; i++) {
@ -111,15 +112,15 @@ public final class SortedBytesMergeUtils {
* mapping in docIDToRelativeOrd. After the merge SortedSourceSlice#ordMapping
* contains the new global ordinals for the relative index.
*/
private static void createOrdMapping(int[] docBases, int[][] docMaps,
private static void createOrdMapping(int[] docBases, MergeState.DocMap[] docMaps,
SortedSourceSlice currentSlice) {
final int readerIdx = currentSlice.readerIdx;
final int[] currentDocMap = docMaps[readerIdx];
final MergeState.DocMap currentDocMap = docMaps[readerIdx];
final int docBase = currentSlice.docToOrdStart;
assert docBase == docBases[readerIdx];
if (currentDocMap != null) { // we have deletes
for (int i = 0; i < currentDocMap.length; i++) {
final int doc = currentDocMap[i];
if (currentDocMap != null && currentDocMap.hasDeletions()) { // we have deletes
for (int i = 0; i < currentDocMap.maxDoc(); i++) {
final int doc = currentDocMap.get(i);
if (doc != -1) { // not deleted
final int ord = currentSlice.source.ord(i); // collect ords strictly
// increasing

View File

@ -25,9 +25,11 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.packed.PackedInts;
public class TestSegmentMerger extends LuceneTestCase {
@ -139,4 +141,41 @@ public class TestSegmentMerger extends LuceneTestCase {
TestSegmentReader.checkNorms(mergedReader);
mergedReader.close();
}
private static boolean equals(MergeState.DocMap map1, MergeState.DocMap map2) {
if (map1.maxDoc() != map2.maxDoc()) {
return false;
}
for (int i = 0; i < map1.maxDoc(); ++i) {
if (map1.get(i) != map2.get(i)) {
return false;
}
}
return true;
}
public void testBuildDocMap() {
final int maxDoc = 128;
final FixedBitSet liveDocs = new FixedBitSet(maxDoc);
MergeState.DocMap docMap1 = MergeState.DocMap.buildDelCountDocmap(maxDoc, maxDoc, liveDocs, PackedInts.COMPACT);
MergeState.DocMap docMap2 = MergeState.DocMap.buildDirectDocMap(maxDoc, 0, liveDocs, PackedInts.COMPACT);
assertTrue(equals(docMap1, docMap2));
liveDocs.set(1);
for (int i = 7; i < 79; ++i) {
liveDocs.set(i);
}
liveDocs.set(80);
liveDocs.set(88);
int numDocs = liveDocs.cardinality();
docMap1 = MergeState.DocMap.buildDelCountDocmap(maxDoc, maxDoc - numDocs, liveDocs, PackedInts.COMPACT);
docMap2 = MergeState.DocMap.buildDirectDocMap(maxDoc, numDocs, liveDocs, PackedInts.COMPACT);
assertTrue(equals(docMap1, docMap2));
liveDocs.set(0, maxDoc);
docMap1 = MergeState.DocMap.buildDelCountDocmap(maxDoc, 0, liveDocs, PackedInts.COMPACT);
docMap2 = MergeState.DocMap.buildDirectDocMap(maxDoc, maxDoc, liveDocs, PackedInts.COMPACT);
assertTrue(equals(docMap1, docMap2));
}
}