mirror of https://github.com/apache/lucene.git
LUCENE-6133: improve default stored fields merge algorithm
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1648327 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
410099b6fb
commit
d249f8bc33
|
@ -183,6 +183,9 @@ Optimizations
|
|||
|
||||
* LUCENE-6131: Optimize SortingMergePolicy. (Robert Muir)
|
||||
|
||||
* LUCENE-6133: Improve default StoredFieldsWriter.merge() to be more efficient.
|
||||
(Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
|
||||
|
|
|
@ -18,14 +18,17 @@ package org.apache.lucene.codecs;
|
|||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.document.DocumentStoredFieldVisitor;
|
||||
import org.apache.lucene.document.StoredField;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.IndexableFieldType;
|
||||
import org.apache.lucene.index.MergeState;
|
||||
import org.apache.lucene.index.StorableField;
|
||||
import org.apache.lucene.index.StoredDocument;
|
||||
import org.apache.lucene.index.StoredFieldVisitor;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* Codec API for writing stored fields:
|
||||
|
@ -81,6 +84,7 @@ public abstract class StoredFieldsWriter implements Closeable {
|
|||
for (int i=0;i<mergeState.storedFieldsReaders.length;i++) {
|
||||
StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[i];
|
||||
storedFieldsReader.checkIntegrity();
|
||||
MergeVisitor visitor = new MergeVisitor(mergeState, i);
|
||||
int maxDoc = mergeState.maxDocs[i];
|
||||
Bits liveDocs = mergeState.liveDocs[i];
|
||||
for (int docID=0;docID<maxDoc;docID++) {
|
||||
|
@ -88,16 +92,9 @@ public abstract class StoredFieldsWriter implements Closeable {
|
|||
// skip deleted docs
|
||||
continue;
|
||||
}
|
||||
// TODO: this could be more efficient using
|
||||
// FieldVisitor instead of loading/writing entire
|
||||
// doc; ie we just have to renumber the field number
|
||||
// on the fly?
|
||||
// NOTE: it's very important to first assign to doc then pass it to
|
||||
// fieldsWriter.addDocument; see LUCENE-1282
|
||||
DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
|
||||
startDocument();
|
||||
storedFieldsReader.visitDocument(docID, visitor);
|
||||
StoredDocument doc = visitor.getDocument();
|
||||
addDocument(doc, mergeState.mergeFieldInfos);
|
||||
finishDocument();
|
||||
docCount++;
|
||||
mergeState.checkAbort.work(300);
|
||||
}
|
||||
|
@ -106,15 +103,134 @@ public abstract class StoredFieldsWriter implements Closeable {
|
|||
return docCount;
|
||||
}
|
||||
|
||||
/** sugar method for startDocument() + writeField() for every stored field in the document */
|
||||
protected final void addDocument(Iterable<? extends StorableField> doc, FieldInfos fieldInfos) throws IOException {
|
||||
startDocument();
|
||||
|
||||
for (StorableField field : doc) {
|
||||
writeField(fieldInfos.fieldInfo(field.name()), field);
|
||||
/**
|
||||
* A visitor that adds every field it sees.
|
||||
* <p>
|
||||
* Use like this:
|
||||
* <pre>
|
||||
* MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
|
||||
* for (...) {
|
||||
* startDocument();
|
||||
* storedFieldsReader.visitDocument(docID, visitor);
|
||||
* finishDocument();
|
||||
* }
|
||||
* </pre>
|
||||
*/
|
||||
protected class MergeVisitor extends StoredFieldVisitor implements StorableField {
|
||||
BytesRef binaryValue;
|
||||
String stringValue;
|
||||
Number numericValue;
|
||||
FieldInfo currentField;
|
||||
FieldInfos remapper;
|
||||
|
||||
/**
|
||||
* Create new merge visitor.
|
||||
*/
|
||||
public MergeVisitor(MergeState mergeState, int readerIndex) {
|
||||
// if field numbers are aligned, we can save hash lookups
|
||||
// on every field access. Otherwise, we need to lookup
|
||||
// fieldname each time, and remap to a new number.
|
||||
for (FieldInfo fi : mergeState.fieldInfos[readerIndex]) {
|
||||
FieldInfo other = mergeState.mergeFieldInfos.fieldInfo(fi.number);
|
||||
if (other == null || !other.name.equals(fi.name)) {
|
||||
remapper = mergeState.mergeFieldInfos;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
|
||||
reset(fieldInfo);
|
||||
binaryValue = new BytesRef(value);
|
||||
write();
|
||||
}
|
||||
|
||||
finishDocument();
|
||||
@Override
|
||||
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
|
||||
reset(fieldInfo);
|
||||
stringValue = value;
|
||||
write();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void intField(FieldInfo fieldInfo, int value) throws IOException {
|
||||
reset(fieldInfo);
|
||||
numericValue = value;
|
||||
write();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void longField(FieldInfo fieldInfo, long value) throws IOException {
|
||||
reset(fieldInfo);
|
||||
numericValue = value;
|
||||
write();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void floatField(FieldInfo fieldInfo, float value) throws IOException {
|
||||
reset(fieldInfo);
|
||||
numericValue = value;
|
||||
write();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
|
||||
reset(fieldInfo);
|
||||
numericValue = value;
|
||||
write();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Status needsField(FieldInfo fieldInfo) throws IOException {
|
||||
return Status.YES;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String name() {
|
||||
return currentField.name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IndexableFieldType fieldType() {
|
||||
return StoredField.TYPE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef binaryValue() {
|
||||
return binaryValue;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String stringValue() {
|
||||
return stringValue;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Number numericValue() {
|
||||
return numericValue;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader readerValue() {
|
||||
return null;
|
||||
}
|
||||
|
||||
void reset(FieldInfo field) {
|
||||
if (remapper != null) {
|
||||
// field numbers are not aligned, we need to remap to the new field number
|
||||
currentField = remapper.fieldInfo(field.name);
|
||||
} else {
|
||||
currentField = field;
|
||||
}
|
||||
binaryValue = null;
|
||||
stringValue = null;
|
||||
numericValue = null;
|
||||
}
|
||||
|
||||
void write() throws IOException {
|
||||
writeField(currentField, this);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -24,7 +24,6 @@ import org.apache.lucene.codecs.CodecUtil;
|
|||
import org.apache.lucene.codecs.StoredFieldsReader;
|
||||
import org.apache.lucene.codecs.StoredFieldsWriter;
|
||||
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.SerializedDocument;
|
||||
import org.apache.lucene.document.DocumentStoredFieldVisitor;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
|
@ -41,6 +40,7 @@ import org.apache.lucene.util.Bits;
|
|||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.GrowableByteArrayDataOutput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
|
@ -74,7 +74,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
|||
private CompressingStoredFieldsIndexWriter indexWriter;
|
||||
private IndexOutput fieldsStream;
|
||||
|
||||
private final CompressionMode compressionMode;
|
||||
private final Compressor compressor;
|
||||
private final int chunkSize;
|
||||
private final int maxDocsPerChunk;
|
||||
|
@ -90,7 +89,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
|||
String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException {
|
||||
assert directory != null;
|
||||
this.segment = si.name;
|
||||
this.compressionMode = compressionMode;
|
||||
this.compressor = compressionMode.newCompressor();
|
||||
this.chunkSize = chunkSize;
|
||||
this.maxDocsPerChunk = maxDocsPerChunk;
|
||||
|
@ -237,6 +235,8 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
|||
numBufferedDocs = 0;
|
||||
bufferedDocs.length = 0;
|
||||
}
|
||||
|
||||
byte scratchBytes[] = new byte[16];
|
||||
|
||||
@Override
|
||||
public void writeField(FieldInfo info, StorableField field)
|
||||
|
@ -284,7 +284,11 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
|||
bufferedDocs.writeVInt(bytes.length);
|
||||
bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
|
||||
} else if (string != null) {
|
||||
bufferedDocs.writeString(field.stringValue());
|
||||
// this is just an optimized writeString() that re-uses scratchBytes.
|
||||
scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
|
||||
int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
|
||||
bufferedDocs.writeVInt(length);
|
||||
bufferedDocs.writeBytes(scratchBytes, length);
|
||||
} else {
|
||||
if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
|
||||
bufferedDocs.writeZInt(number.intValue());
|
||||
|
@ -474,6 +478,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
|||
MatchingReaders matching = new MatchingReaders(mergeState);
|
||||
|
||||
for (int readerIndex=0;readerIndex<numReaders;readerIndex++) {
|
||||
MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
|
||||
CompressingStoredFieldsReader matchingFieldsReader = null;
|
||||
if (matching.matchingReaders[readerIndex]) {
|
||||
final StoredFieldsReader fieldsReader = mergeState.storedFieldsReaders[readerIndex];
|
||||
|
@ -497,9 +502,9 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
|||
if (liveDocs != null && liveDocs.get(docID) == false) {
|
||||
continue;
|
||||
}
|
||||
DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
|
||||
startDocument();
|
||||
storedFieldsReader.visitDocument(docID, visitor);
|
||||
addDocument(visitor.getDocument(), mergeState.mergeFieldInfos);
|
||||
finishDocument();
|
||||
++docCount;
|
||||
mergeState.checkAbort.work(300);
|
||||
}
|
||||
|
|
|
@ -123,7 +123,7 @@ public final class UnicodeUtil {
|
|||
(UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;
|
||||
|
||||
/** Maximum number of UTF8 bytes per UTF16 character. */
|
||||
public static final int MAX_UTF8_BYTES_PER_CHAR = 4;
|
||||
public static final int MAX_UTF8_BYTES_PER_CHAR = 3;
|
||||
|
||||
/** Encode characters from a char[] source, starting at
|
||||
* offset for length chars. It is the responsibility of the
|
||||
|
|
Loading…
Reference in New Issue