LUCENE-6133: improve default stored fields merge algorithm

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1648327 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-12-29 13:02:20 +00:00
parent 410099b6fb
commit d249f8bc33
4 changed files with 149 additions and 25 deletions

lucene/CHANGES.txt

@@ -183,6 +183,9 @@ Optimizations

 * LUCENE-6131: Optimize SortingMergePolicy.  (Robert Muir)

+* LUCENE-6133: Improve default StoredFieldsWriter.merge() to be more efficient.
+  (Robert Muir)
+
 API Changes

 * LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and

lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java

@@ -18,14 +18,17 @@ package org.apache.lucene.codecs;

 import java.io.Closeable;
 import java.io.IOException;
+import java.io.Reader;

-import org.apache.lucene.document.DocumentStoredFieldVisitor;
+import org.apache.lucene.document.StoredField;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexableFieldType;
 import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.StorableField;
-import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;

 /**
  * Codec API for writing stored fields:
@@ -81,6 +84,7 @@ public abstract class StoredFieldsWriter implements Closeable {
     for (int i=0;i<mergeState.storedFieldsReaders.length;i++) {
       StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[i];
       storedFieldsReader.checkIntegrity();
+      MergeVisitor visitor = new MergeVisitor(mergeState, i);
       int maxDoc = mergeState.maxDocs[i];
       Bits liveDocs = mergeState.liveDocs[i];
       for (int docID=0;docID<maxDoc;docID++) {
@@ -88,16 +92,9 @@ public abstract class StoredFieldsWriter implements Closeable {
           // skip deleted docs
           continue;
         }
-        // TODO: this could be more efficient using
-        // FieldVisitor instead of loading/writing entire
-        // doc; ie we just have to renumber the field number
-        // on the fly?
-        // NOTE: it's very important to first assign to doc then pass it to
-        // fieldsWriter.addDocument; see LUCENE-1282
-        DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
+        startDocument();
         storedFieldsReader.visitDocument(docID, visitor);
-        StoredDocument doc = visitor.getDocument();
-        addDocument(doc, mergeState.mergeFieldInfos);
+        finishDocument();
         docCount++;
         mergeState.checkAbort.work(300);
       }
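The two hunks above are the core of the change: instead of materializing each document into a DocumentStoredFieldVisitor and re-adding it, merge() now streams every stored field straight into writeField() through a single MergeVisitor created once per segment, outside the document loop. A minimal sketch of the difference, using hypothetical stand-in types (FieldVisitor, StoredReader, and MergeSketch are illustrations, not Lucene APIs):

import java.util.ArrayList;
import java.util.List;

interface FieldVisitor {
  void stringField(String name, String value);
}

interface StoredReader {
  void visitDocument(int docID, FieldVisitor visitor);
}

final class MergeSketch {
  void startDocument() { /* begin a document in the merged segment */ }
  void writeField(String name, String value) { /* append one stored field */ }
  void finishDocument() { /* end the document */ }

  // Old path: buffer every field of the document, then replay the buffer.
  void mergeDocBuffered(StoredReader reader, int docID) {
    List<String[]> buffered = new ArrayList<>();
    reader.visitDocument(docID, (name, value) -> buffered.add(new String[] { name, value }));
    startDocument();
    for (String[] f : buffered) {
      writeField(f[0], f[1]);
    }
    finishDocument();
  }

  // New path: one long-lived visitor forwards each field as it arrives,
  // so no per-document buffer or document object is allocated.
  private final FieldVisitor mergeVisitor = this::writeField;

  void mergeDocStreaming(StoredReader reader, int docID) {
    startDocument();
    reader.visitDocument(docID, mergeVisitor); // fields stream straight through
    finishDocument();
  }
}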
@@ -106,15 +103,134 @@ public abstract class StoredFieldsWriter implements Closeable {
     return docCount;
   }

-  /** sugar method for startDocument() + writeField() for every stored field in the document */
-  protected final void addDocument(Iterable<? extends StorableField> doc, FieldInfos fieldInfos) throws IOException {
-    startDocument();
-    for (StorableField field : doc) {
-      writeField(fieldInfos.fieldInfo(field.name()), field);
-    }
-    finishDocument();
-  }
+  /**
+   * A visitor that adds every field it sees.
+   * <p>
+   * Use like this:
+   * <pre>
+   * MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
+   * for (...) {
+   *   startDocument();
+   *   storedFieldsReader.visitDocument(docID, visitor);
+   *   finishDocument();
+   * }
+   * </pre>
+   */
+  protected class MergeVisitor extends StoredFieldVisitor implements StorableField {
+    BytesRef binaryValue;
+    String stringValue;
+    Number numericValue;
+    FieldInfo currentField;
+    FieldInfos remapper;
+
+    /**
+     * Create new merge visitor.
+     */
+    public MergeVisitor(MergeState mergeState, int readerIndex) {
+      // if field numbers are aligned, we can save hash lookups
+      // on every field access. Otherwise, we need to lookup
+      // fieldname each time, and remap to a new number.
+      for (FieldInfo fi : mergeState.fieldInfos[readerIndex]) {
+        FieldInfo other = mergeState.mergeFieldInfos.fieldInfo(fi.number);
+        if (other == null || !other.name.equals(fi.name)) {
+          remapper = mergeState.mergeFieldInfos;
+          break;
+        }
+      }
+    }
+
+    @Override
+    public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
+      reset(fieldInfo);
+      binaryValue = new BytesRef(value);
+      write();
+    }
+
+    @Override
+    public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+      reset(fieldInfo);
+      stringValue = value;
+      write();
+    }
+
+    @Override
+    public void intField(FieldInfo fieldInfo, int value) throws IOException {
+      reset(fieldInfo);
+      numericValue = value;
+      write();
+    }
+
+    @Override
+    public void longField(FieldInfo fieldInfo, long value) throws IOException {
+      reset(fieldInfo);
+      numericValue = value;
+      write();
+    }
+
+    @Override
+    public void floatField(FieldInfo fieldInfo, float value) throws IOException {
+      reset(fieldInfo);
+      numericValue = value;
+      write();
+    }
+
+    @Override
+    public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
+      reset(fieldInfo);
+      numericValue = value;
+      write();
+    }
+
+    @Override
+    public Status needsField(FieldInfo fieldInfo) throws IOException {
+      return Status.YES;
+    }
+
+    @Override
+    public String name() {
+      return currentField.name;
+    }
+
+    @Override
+    public IndexableFieldType fieldType() {
+      return StoredField.TYPE;
+    }
+
+    @Override
+    public BytesRef binaryValue() {
+      return binaryValue;
+    }
+
+    @Override
+    public String stringValue() {
+      return stringValue;
+    }
+
+    @Override
+    public Number numericValue() {
+      return numericValue;
+    }
+
+    @Override
+    public Reader readerValue() {
+      return null;
+    }
+
+    void reset(FieldInfo field) {
+      if (remapper != null) {
+        // field numbers are not aligned, we need to remap to the new field number
+        currentField = remapper.fieldInfo(field.name);
+      } else {
+        currentField = field;
+      }
+      binaryValue = null;
+      stringValue = null;
+      numericValue = null;
+    }
+
+    void write() throws IOException {
+      writeField(currentField, this);
+    }
+  }

   @Override
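MergeVisitor doubles as the StorableField it hands to writeField(), so a field's value never has to be copied into a StoredField object. The constructor's alignment check decides the fast path: only if some field number maps to a different name in the merged FieldInfos does reset() fall back to a per-field name lookup. A self-contained sketch of that same check, with plain maps standing in for FieldInfos (hypothetical names, not the Lucene API):

import java.util.HashMap;
import java.util.Map;

final class FieldNumberAlignment {
  static boolean aligned(Map<Integer, String> segment, Map<Integer, String> merged) {
    for (Map.Entry<Integer, String> e : segment.entrySet()) {
      String other = merged.get(e.getKey());
      if (other == null || !other.equals(e.getValue())) {
        return false; // numbers diverge: must remap each field by name
      }
    }
    return true; // identical numbering: reuse the incoming FieldInfo directly
  }

  public static void main(String[] args) {
    Map<Integer, String> seg = new HashMap<>();
    seg.put(0, "id");
    seg.put(1, "body");
    Map<Integer, String> merged = new HashMap<>(seg);
    System.out.println(aligned(seg, merged)); // true: fast path, no lookups
    merged.put(1, "title");                   // same number, different field
    System.out.println(aligned(seg, merged)); // false: remap via name lookup
  }
}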

lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java

@@ -24,7 +24,6 @@ import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.StoredFieldsReader;
 import org.apache.lucene.codecs.StoredFieldsWriter;
 import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.SerializedDocument;
-import org.apache.lucene.document.DocumentStoredFieldVisitor;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexFileNames;
@@ -41,6 +40,7 @@ import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.GrowableByteArrayDataOutput;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.packed.PackedInts;

 /**
@@ -74,7 +74,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
   private CompressingStoredFieldsIndexWriter indexWriter;
   private IndexOutput fieldsStream;
-  private final CompressionMode compressionMode;
   private final Compressor compressor;
   private final int chunkSize;
   private final int maxDocsPerChunk;
@@ -90,7 +89,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
       String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException {
     assert directory != null;
     this.segment = si.name;
-    this.compressionMode = compressionMode;
     this.compressor = compressionMode.newCompressor();
     this.chunkSize = chunkSize;
     this.maxDocsPerChunk = maxDocsPerChunk;
@@ -237,6 +235,8 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
     numBufferedDocs = 0;
     bufferedDocs.length = 0;
   }
+
+  byte scratchBytes[] = new byte[16];

   @Override
   public void writeField(FieldInfo info, StorableField field)
@@ -284,7 +284,11 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
       bufferedDocs.writeVInt(bytes.length);
       bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
     } else if (string != null) {
-      bufferedDocs.writeString(field.stringValue());
+      // this is just an optimized writeString() that re-uses scratchBytes.
+      scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
+      int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
+      bufferedDocs.writeVInt(length);
+      bufferedDocs.writeBytes(scratchBytes, length);
     } else {
       if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
         bufferedDocs.writeZInt(number.intValue());
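The scratch buffer works because string.length() * MAX_UTF8_BYTES_PER_CHAR is a guaranteed upper bound on the UTF-8 output, so one growable array serves every stored string field with no per-field allocation. A standalone sketch of the same pattern using only the JDK (an illustration, not Lucene's UnicodeUtil; malformed surrogate input is not handled):

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

final class ReusableUtf8Buffer {
  private static final int MAX_UTF8_BYTES_PER_CHAR = 3; // worst case per UTF-16 unit
  private final CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder();
  private byte[] scratch = new byte[16];

  /** Encodes s into the reusable scratch buffer; returns the encoded byte length. */
  int encode(String s) {
    int worstCase = s.length() * MAX_UTF8_BYTES_PER_CHAR;
    if (scratch.length < worstCase) {
      scratch = Arrays.copyOf(scratch, worstCase); // grow once, reuse afterwards
    }
    ByteBuffer out = ByteBuffer.wrap(scratch);     // cannot overflow: sized by worst case
    encoder.reset();
    encoder.encode(CharBuffer.wrap(s), out, true);
    encoder.flush(out);
    return out.position();
  }

  byte[] buffer() {
    return scratch; // valid up to the length returned by encode()
  }
}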
@@ -474,6 +478,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
     MatchingReaders matching = new MatchingReaders(mergeState);
     for (int readerIndex=0;readerIndex<numReaders;readerIndex++) {
+      MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
       CompressingStoredFieldsReader matchingFieldsReader = null;
       if (matching.matchingReaders[readerIndex]) {
         final StoredFieldsReader fieldsReader = mergeState.storedFieldsReaders[readerIndex];
@@ -497,9 +502,9 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
         if (liveDocs != null && liveDocs.get(docID) == false) {
           continue;
         }
-        DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
+        startDocument();
         storedFieldsReader.visitDocument(docID, visitor);
-        addDocument(visitor.getDocument(), mergeState.mergeFieldInfos);
+        finishDocument();
         ++docCount;
         mergeState.checkAbort.work(300);
       }

lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java

@@ -123,7 +123,7 @@
       (UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;

   /** Maximum number of UTF8 bytes per UTF16 character. */
-  public static final int MAX_UTF8_BYTES_PER_CHAR = 4;
+  public static final int MAX_UTF8_BYTES_PER_CHAR = 3;

   /** Encode characters from a char[] source, starting at
    *  offset for length chars. It is the responsibility of the
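Tightening the bound from 4 to 3 is safe because a 4-byte UTF-8 sequence encodes a code point above U+FFFF, and such a code point occupies two UTF-16 chars (a surrogate pair), i.e. only 2 bytes per char; any single BMP char encodes to at most 3 bytes. A quick JDK-only check of that invariant (sample strings are illustrative):

import java.nio.charset.StandardCharsets;

public class Utf8Bound {
  public static void main(String[] args) {
    // "a" -> 1 byte, "\u00E9" -> 2 bytes, "\u20AC" -> 3 bytes, emoji -> 4 bytes over 2 chars
    String[] samples = { "a", "\u00E9", "\u20AC", "\uD83D\uDE00" };
    for (String s : samples) {
      int bytes = s.getBytes(StandardCharsets.UTF_8).length;
      System.out.printf("chars=%d utf8Bytes=%d bytes/char=%.1f%n",
          s.length(), bytes, (double) bytes / s.length());
      assert bytes <= s.length() * 3; // the tightened MAX_UTF8_BYTES_PER_CHAR bound
    }
  }
}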