LUCENE-6133: improve default stored fields merge algorithm

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1648327 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir, 2014-12-29 13:02:20 +00:00
parent 410099b6fb
commit d249f8bc33
4 changed files with 149 additions and 25 deletions
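
In short: before this change, the default StoredFieldsWriter.merge() loaded each live document into a DocumentStoredFieldVisitor, materialized it as a StoredDocument, and re-added it through addDocument(), rebuilding every field object along the way. Afterwards, a single reusable MergeVisitor implements both StoredFieldVisitor and StorableField, so each field visited in the source segment is handed straight back to writeField() with no intermediate document. The following sketch is a self-contained toy model of that trick; every type in it is a simplified stand-in for the Lucene classes in the diff below, not the real API.

// ToyMergeVisitor.java -- a toy model of the "visitor doubles as the field"
// pattern this commit introduces. All types are simplified stand-ins.
public class ToyMergeVisitor {

  // Stand-in for StoredFieldVisitor: one callback per stored field.
  interface FieldVisitor {
    void stringField(String name, String value);
  }

  // Stand-in for StorableField: what the writer consumes.
  interface Field {
    String name();
    String stringValue();
  }

  // Stand-in for StoredFieldsWriter.writeField().
  static class Writer {
    void writeField(Field f) {
      System.out.println(f.name() + " = " + f.stringValue());
    }
  }

  // The visitor also implements Field: each callback resets its per-field
  // state and immediately hands itself back to the writer, so no
  // intermediate document object is ever built.
  static class MergeVisitor implements FieldVisitor, Field {
    private final Writer writer;
    private String name, value;

    MergeVisitor(Writer writer) { this.writer = writer; }

    @Override public void stringField(String name, String value) {
      this.name = name;         // per-field reset
      this.value = value;
      writer.writeField(this);  // stream straight through
    }

    @Override public String name() { return name; }
    @Override public String stringValue() { return value; }
  }

  public static void main(String[] args) {
    MergeVisitor visitor = new MergeVisitor(new Writer());
    // A "reader" replaying one document's stored fields into the visitor:
    visitor.stringField("title", "LUCENE-6133");
    visitor.stringField("body", "stream fields during merge");
  }
}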

lucene/CHANGES.txt

@@ -183,6 +183,9 @@ Optimizations
 
 * LUCENE-6131: Optimize SortingMergePolicy. (Robert Muir)
 
+* LUCENE-6133: Improve default StoredFieldsWriter.merge() to be more efficient.
+  (Robert Muir)
+
 API Changes
 
 * LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and

lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java

@@ -18,14 +18,17 @@ package org.apache.lucene.codecs;
 
 import java.io.Closeable;
 import java.io.IOException;
+import java.io.Reader;
 
-import org.apache.lucene.document.DocumentStoredFieldVisitor;
+import org.apache.lucene.document.StoredField;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexableFieldType;
 import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.StorableField;
-import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
 
 /**
  * Codec API for writing stored fields:
@@ -81,6 +84,7 @@ public abstract class StoredFieldsWriter implements Closeable {
     for (int i=0;i<mergeState.storedFieldsReaders.length;i++) {
       StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[i];
       storedFieldsReader.checkIntegrity();
+      MergeVisitor visitor = new MergeVisitor(mergeState, i);
       int maxDoc = mergeState.maxDocs[i];
       Bits liveDocs = mergeState.liveDocs[i];
       for (int docID=0;docID<maxDoc;docID++) {
@@ -88,16 +92,9 @@ public abstract class StoredFieldsWriter implements Closeable {
           // skip deleted docs
           continue;
         }
-        // TODO: this could be more efficient using
-        // FieldVisitor instead of loading/writing entire
-        // doc; ie we just have to renumber the field number
-        // on the fly?
-        // NOTE: it's very important to first assign to doc then pass it to
-        // fieldsWriter.addDocument; see LUCENE-1282
-        DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
+        startDocument();
         storedFieldsReader.visitDocument(docID, visitor);
-        StoredDocument doc = visitor.getDocument();
-        addDocument(doc, mergeState.mergeFieldInfos);
+        finishDocument();
         docCount++;
         mergeState.checkAbort.work(300);
       }
@@ -106,15 +103,134 @@ public abstract class StoredFieldsWriter implements Closeable {
     return docCount;
   }
 
-  /** sugar method for startDocument() + writeField() for every stored field in the document */
-  protected final void addDocument(Iterable<? extends StorableField> doc, FieldInfos fieldInfos) throws IOException {
-    startDocument();
-
-    for (StorableField field : doc) {
-      writeField(fieldInfos.fieldInfo(field.name()), field);
-    }
-
-    finishDocument();
-  }
+  /**
+   * A visitor that adds every field it sees.
+   * <p>
+   * Use like this:
+   * <pre>
+   * MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
+   * for (...) {
+   *   startDocument();
+   *   storedFieldsReader.visitDocument(docID, visitor);
+   *   finishDocument();
+   * }
+   * </pre>
+   */
+  protected class MergeVisitor extends StoredFieldVisitor implements StorableField {
+    BytesRef binaryValue;
+    String stringValue;
+    Number numericValue;
+    FieldInfo currentField;
+    FieldInfos remapper;
+
+    /**
+     * Create new merge visitor.
+     */
+    public MergeVisitor(MergeState mergeState, int readerIndex) {
+      // if field numbers are aligned, we can save hash lookups
+      // on every field access. Otherwise, we need to lookup
+      // fieldname each time, and remap to a new number.
+      for (FieldInfo fi : mergeState.fieldInfos[readerIndex]) {
+        FieldInfo other = mergeState.mergeFieldInfos.fieldInfo(fi.number);
+        if (other == null || !other.name.equals(fi.name)) {
+          remapper = mergeState.mergeFieldInfos;
+          break;
+        }
+      }
+    }
+
+    @Override
+    public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
+      reset(fieldInfo);
+      binaryValue = new BytesRef(value);
+      write();
+    }
+
+    @Override
+    public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+      reset(fieldInfo);
+      stringValue = value;
+      write();
+    }
+
+    @Override
+    public void intField(FieldInfo fieldInfo, int value) throws IOException {
+      reset(fieldInfo);
+      numericValue = value;
+      write();
+    }
+
+    @Override
+    public void longField(FieldInfo fieldInfo, long value) throws IOException {
+      reset(fieldInfo);
+      numericValue = value;
+      write();
+    }
+
+    @Override
+    public void floatField(FieldInfo fieldInfo, float value) throws IOException {
+      reset(fieldInfo);
+      numericValue = value;
+      write();
+    }
+
+    @Override
+    public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
+      reset(fieldInfo);
+      numericValue = value;
+      write();
+    }
+
+    @Override
+    public Status needsField(FieldInfo fieldInfo) throws IOException {
+      return Status.YES;
+    }
+
+    @Override
+    public String name() {
+      return currentField.name;
+    }
+
+    @Override
+    public IndexableFieldType fieldType() {
+      return StoredField.TYPE;
+    }
+
+    @Override
+    public BytesRef binaryValue() {
+      return binaryValue;
+    }
+
+    @Override
+    public String stringValue() {
+      return stringValue;
+    }
+
+    @Override
+    public Number numericValue() {
+      return numericValue;
+    }
+
+    @Override
+    public Reader readerValue() {
+      return null;
+    }
+
+    void reset(FieldInfo field) {
+      if (remapper != null) {
+        // field numbers are not aligned, we need to remap to the new field number
+        currentField = remapper.fieldInfo(field.name);
+      } else {
+        currentField = field;
+      }
+      binaryValue = null;
+      stringValue = null;
+      numericValue = null;
+    }
+
+    void write() throws IOException {
+      writeField(currentField, this);
+    }
+  }
 
   @Override
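
A note on the constructor above: MergeVisitor probes whether the source segment's field numbers line up with the merged FieldInfos. If every (number, name) pair matches, reset() reuses the incoming FieldInfo as-is; only misaligned segments pay a by-name hash lookup per field value. Below is a rough, self-contained model of that probe, using plain maps in place of FieldInfos -- the logic, not the API, is what it demonstrates.

import java.util.LinkedHashMap;
import java.util.Map;

// Toy model of MergeVisitor's alignment probe: numbers are "aligned" when
// every source field number resolves to the same-named field in the merged
// schema. Maps stand in for Lucene's FieldInfos.
public class AlignmentCheck {

  static boolean aligned(Map<Integer, String> segmentFields,
                         Map<Integer, String> mergedFields) {
    for (Map.Entry<Integer, String> e : segmentFields.entrySet()) {
      String other = mergedFields.get(e.getKey());
      if (other == null || !other.equals(e.getValue())) {
        return false; // this segment must remap by field name
      }
    }
    return true; // field numbers can be reused as-is
  }

  public static void main(String[] args) {
    Map<Integer, String> merged = new LinkedHashMap<>();
    merged.put(0, "id");
    merged.put(1, "title");

    Map<Integer, String> segA = new LinkedHashMap<>(merged); // same numbering

    Map<Integer, String> segB = new LinkedHashMap<>();
    segB.put(0, "title"); // same fields, swapped numbers
    segB.put(1, "id");

    System.out.println(aligned(segA, merged)); // true  -> fast path
    System.out.println(aligned(segB, merged)); // false -> remap by name
  }
}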

lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java

@@ -24,7 +24,6 @@ import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.StoredFieldsReader;
 import org.apache.lucene.codecs.StoredFieldsWriter;
 import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.SerializedDocument;
-import org.apache.lucene.document.DocumentStoredFieldVisitor;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexFileNames;
@@ -41,6 +40,7 @@ import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.GrowableByteArrayDataOutput;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.packed.PackedInts;
 
 /**
@@ -74,7 +74,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
   private CompressingStoredFieldsIndexWriter indexWriter;
   private IndexOutput fieldsStream;
 
-  private final CompressionMode compressionMode;
   private final Compressor compressor;
   private final int chunkSize;
   private final int maxDocsPerChunk;
@@ -90,7 +89,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
       String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException {
     assert directory != null;
     this.segment = si.name;
-    this.compressionMode = compressionMode;
     this.compressor = compressionMode.newCompressor();
     this.chunkSize = chunkSize;
     this.maxDocsPerChunk = maxDocsPerChunk;
@@ -237,6 +235,8 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
     numBufferedDocs = 0;
     bufferedDocs.length = 0;
   }
 
+  byte scratchBytes[] = new byte[16];
+
   @Override
   public void writeField(FieldInfo info, StorableField field)
@@ -284,7 +284,11 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
       bufferedDocs.writeVInt(bytes.length);
       bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
     } else if (string != null) {
-      bufferedDocs.writeString(field.stringValue());
+      // this is just an optimized writeString() that re-uses scratchBytes.
+      scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
+      int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
+      bufferedDocs.writeVInt(length);
+      bufferedDocs.writeBytes(scratchBytes, length);
     } else {
       if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
         bufferedDocs.writeZInt(number.intValue());
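
The string branch above is effectively a hand-unrolled writeString(): it encodes into one growable scratchBytes array so that merging many string fields does not allocate a fresh byte[] per value. The sketch below shows the same reuse pattern in plain Java, with the JDK's CharsetEncoder standing in for Lucene's UnicodeUtil.UTF16toUTF8(); the class and method names are illustrative only.

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;

// Encodes strings into one reusable scratch buffer instead of allocating a
// new byte[] per string (which String.getBytes() would do).
public class ScratchUtf8 {
  private static final int MAX_UTF8_BYTES_PER_CHAR = 3; // see UnicodeUtil below
  private final CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder();
  private byte[] scratch = new byte[16];

  /** Encodes s into the scratch buffer; returns the number of UTF-8 bytes. */
  int encode(String s) {
    int maxLen = s.length() * MAX_UTF8_BYTES_PER_CHAR; // worst-case size
    if (scratch.length < maxLen) {
      // grow with headroom, in the spirit of Lucene's ArrayUtil.grow()
      scratch = new byte[Math.max(maxLen, scratch.length * 2)];
    }
    ByteBuffer out = ByteBuffer.wrap(scratch);
    encoder.reset();
    encoder.encode(CharBuffer.wrap(s), out, true); // UTF-8 keeps no state to flush
    return out.position();
  }

  public static void main(String[] args) {
    ScratchUtf8 w = new ScratchUtf8();
    System.out.println(w.encode("hello")); // 5
    System.out.println(w.encode("héllo")); // 6: 'é' takes two bytes
  }
}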
@@ -474,6 +478,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
     MatchingReaders matching = new MatchingReaders(mergeState);
 
     for (int readerIndex=0;readerIndex<numReaders;readerIndex++) {
+      MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
       CompressingStoredFieldsReader matchingFieldsReader = null;
       if (matching.matchingReaders[readerIndex]) {
         final StoredFieldsReader fieldsReader = mergeState.storedFieldsReaders[readerIndex];
@@ -497,9 +502,9 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
         if (liveDocs != null && liveDocs.get(docID) == false) {
           continue;
         }
-        DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
+        startDocument();
         storedFieldsReader.visitDocument(docID, visitor);
-        addDocument(visitor.getDocument(), mergeState.mergeFieldInfos);
+        finishDocument();
         ++docCount;
         mergeState.checkAbort.work(300);
       }

lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java

@@ -123,7 +123,7 @@ public final class UnicodeUtil {
       (UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;
 
   /** Maximum number of UTF8 bytes per UTF16 character. */
-  public static final int MAX_UTF8_BYTES_PER_CHAR = 4;
+  public static final int MAX_UTF8_BYTES_PER_CHAR = 3;
 
   /** Encode characters from a char[] source, starting at
    *  offset for length chars. It is the responsibility of the
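
The 4 -> 3 change above is what makes scratch-buffer sizing like the one in CompressingStoredFieldsWriter both safe and tight: a single UTF-16 char (code unit) encodes to at most 3 UTF-8 bytes. Code points beyond the BMP do take 4 UTF-8 bytes, but they occupy two UTF-16 chars (a surrogate pair), i.e. only 2 bytes per char. A quick check with the JDK (class name is illustrative):

import java.nio.charset.StandardCharsets;

// Verifies the per-UTF-16-char bound: the worst case is a 3-byte BMP char;
// supplementary code points cost 4 bytes but span 2 chars.
public class Utf8Bound {
  public static void main(String[] args) {
    String bmpMax = "\uFFFF";       // 1 char  -> 3 UTF-8 bytes
    String gclef = "\uD834\uDD1E";  // U+1D11E: 2 chars -> 4 UTF-8 bytes
    System.out.println(bmpMax.getBytes(StandardCharsets.UTF_8).length); // 3
    System.out.println(gclef.getBytes(StandardCharsets.UTF_8).length);  // 4
  }
}

So string.length() * MAX_UTF8_BYTES_PER_CHAR remains a correct upper bound with the new value, and buffers sized this way are 25% smaller than under the old constant.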