mirror of https://github.com/apache/lucene.git
LUCENE-6133: improve default stored fields merge algorithm
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1648327 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
410099b6fb
commit
d249f8bc33
|
@ -183,6 +183,9 @@ Optimizations
|
||||||
|
|
||||||
* LUCENE-6131: Optimize SortingMergePolicy. (Robert Muir)
|
* LUCENE-6131: Optimize SortingMergePolicy. (Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-6133: Improve default StoredFieldsWriter.merge() to be more efficient.
|
||||||
|
(Robert Muir)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
|
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
|
||||||
|
|
|
@ -18,14 +18,17 @@ package org.apache.lucene.codecs;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
import org.apache.lucene.document.DocumentStoredFieldVisitor;
|
import org.apache.lucene.document.StoredField;
|
||||||
import org.apache.lucene.index.FieldInfo;
|
import org.apache.lucene.index.FieldInfo;
|
||||||
import org.apache.lucene.index.FieldInfos;
|
import org.apache.lucene.index.FieldInfos;
|
||||||
|
import org.apache.lucene.index.IndexableFieldType;
|
||||||
import org.apache.lucene.index.MergeState;
|
import org.apache.lucene.index.MergeState;
|
||||||
import org.apache.lucene.index.StorableField;
|
import org.apache.lucene.index.StorableField;
|
||||||
import org.apache.lucene.index.StoredDocument;
|
import org.apache.lucene.index.StoredFieldVisitor;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Codec API for writing stored fields:
|
* Codec API for writing stored fields:
|
||||||
|
@ -81,6 +84,7 @@ public abstract class StoredFieldsWriter implements Closeable {
|
||||||
for (int i=0;i<mergeState.storedFieldsReaders.length;i++) {
|
for (int i=0;i<mergeState.storedFieldsReaders.length;i++) {
|
||||||
StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[i];
|
StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[i];
|
||||||
storedFieldsReader.checkIntegrity();
|
storedFieldsReader.checkIntegrity();
|
||||||
|
MergeVisitor visitor = new MergeVisitor(mergeState, i);
|
||||||
int maxDoc = mergeState.maxDocs[i];
|
int maxDoc = mergeState.maxDocs[i];
|
||||||
Bits liveDocs = mergeState.liveDocs[i];
|
Bits liveDocs = mergeState.liveDocs[i];
|
||||||
for (int docID=0;docID<maxDoc;docID++) {
|
for (int docID=0;docID<maxDoc;docID++) {
|
||||||
|
@ -88,16 +92,9 @@ public abstract class StoredFieldsWriter implements Closeable {
|
||||||
// skip deleted docs
|
// skip deleted docs
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// TODO: this could be more efficient using
|
startDocument();
|
||||||
// FieldVisitor instead of loading/writing entire
|
|
||||||
// doc; ie we just have to renumber the field number
|
|
||||||
// on the fly?
|
|
||||||
// NOTE: it's very important to first assign to doc then pass it to
|
|
||||||
// fieldsWriter.addDocument; see LUCENE-1282
|
|
||||||
DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
|
|
||||||
storedFieldsReader.visitDocument(docID, visitor);
|
storedFieldsReader.visitDocument(docID, visitor);
|
||||||
StoredDocument doc = visitor.getDocument();
|
finishDocument();
|
||||||
addDocument(doc, mergeState.mergeFieldInfos);
|
|
||||||
docCount++;
|
docCount++;
|
||||||
mergeState.checkAbort.work(300);
|
mergeState.checkAbort.work(300);
|
||||||
}
|
}
|
||||||
|
@ -106,15 +103,134 @@ public abstract class StoredFieldsWriter implements Closeable {
|
||||||
return docCount;
|
return docCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** sugar method for startDocument() + writeField() for every stored field in the document */
|
/**
|
||||||
protected final void addDocument(Iterable<? extends StorableField> doc, FieldInfos fieldInfos) throws IOException {
|
* A visitor that adds every field it sees.
|
||||||
startDocument();
|
* <p>
|
||||||
|
* Use like this:
|
||||||
|
* <pre>
|
||||||
|
* MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
|
||||||
|
* for (...) {
|
||||||
|
* startDocument();
|
||||||
|
* storedFieldsReader.visitDocument(docID, visitor);
|
||||||
|
* finishDocument();
|
||||||
|
* }
|
||||||
|
* </pre>
|
||||||
|
*/
|
||||||
|
protected class MergeVisitor extends StoredFieldVisitor implements StorableField {
|
||||||
|
BytesRef binaryValue;
|
||||||
|
String stringValue;
|
||||||
|
Number numericValue;
|
||||||
|
FieldInfo currentField;
|
||||||
|
FieldInfos remapper;
|
||||||
|
|
||||||
for (StorableField field : doc) {
|
/**
|
||||||
writeField(fieldInfos.fieldInfo(field.name()), field);
|
* Create new merge visitor.
|
||||||
|
*/
|
||||||
|
public MergeVisitor(MergeState mergeState, int readerIndex) {
|
||||||
|
// if field numbers are aligned, we can save hash lookups
|
||||||
|
// on every field access. Otherwise, we need to look up
|
||||||
|
// fieldname each time, and remap to a new number.
|
||||||
|
// Scan this reader's fields; remapper stays null unless a mismatch is found.
for (FieldInfo fi : mergeState.fieldInfos[readerIndex]) {
|
||||||
|
// Field carrying the same number in the merged FieldInfos (may be absent).
FieldInfo other = mergeState.mergeFieldInfos.fieldInfo(fi.number);
|
||||||
|
if (other == null || !other.name.equals(fi.name)) {
|
||||||
|
// Numbering differs from the merged segment: remember the merged
// FieldInfos so reset() can translate by field name.
remapper = mergeState.mergeFieldInfos;
|
||||||
|
// One mismatch is enough; no need to inspect the remaining fields.
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
finishDocument();
|
/** Visitor callback for a binary stored field: records the value and writes it out at once. */
@Override
|
||||||
|
public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
|
||||||
|
// Select the target FieldInfo and clear any value left from the previous field.
reset(fieldInfo);
|
||||||
|
// Wrap the array in a BytesRef so it can be returned from binaryValue().
binaryValue = new BytesRef(value);
|
||||||
|
write();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Visitor callback for a string stored field: records the value and writes it out at once. */
@Override
|
||||||
|
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
|
||||||
|
// Select the target FieldInfo and clear any value left from the previous field.
reset(fieldInfo);
|
||||||
|
stringValue = value;
|
||||||
|
write();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Visitor callback for an int stored field: records the value and writes it out at once. */
@Override
|
||||||
|
public void intField(FieldInfo fieldInfo, int value) throws IOException {
|
||||||
|
// Select the target FieldInfo and clear any value left from the previous field.
reset(fieldInfo);
|
||||||
|
// The primitive is boxed because numericValue is declared as Number.
numericValue = value;
|
||||||
|
write();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Visitor callback for a long stored field: records the value and writes it out at once. */
@Override
|
||||||
|
public void longField(FieldInfo fieldInfo, long value) throws IOException {
|
||||||
|
// Select the target FieldInfo and clear any value left from the previous field.
reset(fieldInfo);
|
||||||
|
// The primitive is boxed because numericValue is declared as Number.
numericValue = value;
|
||||||
|
write();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Visitor callback for a float stored field: records the value and writes it out at once. */
@Override
|
||||||
|
public void floatField(FieldInfo fieldInfo, float value) throws IOException {
|
||||||
|
// Select the target FieldInfo and clear any value left from the previous field.
reset(fieldInfo);
|
||||||
|
// The primitive is boxed because numericValue is declared as Number.
numericValue = value;
|
||||||
|
write();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Visitor callback for a double stored field: records the value and writes it out at once. */
@Override
|
||||||
|
public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
|
||||||
|
// Select the target FieldInfo and clear any value left from the previous field.
reset(fieldInfo);
|
||||||
|
// The primitive is boxed because numericValue is declared as Number.
numericValue = value;
|
||||||
|
write();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Accepts every field unconditionally, so each stored field reaches one of the typed callbacks. */
@Override
|
||||||
|
public Status needsField(FieldInfo fieldInfo) throws IOException {
|
||||||
|
return Status.YES;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the name of the field selected by the most recent reset() call. */
@Override
|
||||||
|
public String name() {
|
||||||
|
return currentField.name;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Always reports the shared StoredField.TYPE, regardless of which field is current. */
@Override
|
||||||
|
public IndexableFieldType fieldType() {
|
||||||
|
return StoredField.TYPE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the binary value of the current field, or null if the current field is not binary. */
@Override
|
||||||
|
public BytesRef binaryValue() {
|
||||||
|
return binaryValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the string value of the current field, or null if the current field is not a string. */
@Override
|
||||||
|
public String stringValue() {
|
||||||
|
return stringValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the boxed numeric value of the current field, or null if the current field is not numeric. */
@Override
|
||||||
|
public Number numericValue() {
|
||||||
|
return numericValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Always null: this visitor never holds a Reader-backed value. */
@Override
|
||||||
|
public Reader readerValue() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Prepares the visitor for the next field: chooses the FieldInfo to write under
 * and clears all three value slots so that only the one assigned by the calling
 * callback is non-null when write() runs.
 */
void reset(FieldInfo field) {
|
||||||
|
if (remapper != null) {
|
||||||
|
// field numbers are not aligned, we need to remap to the new field number
|
||||||
|
currentField = remapper.fieldInfo(field.name);
|
||||||
|
} else {
|
||||||
|
// Numbers already match the merged segment: use the reader's FieldInfo as-is.
currentField = field;
|
||||||
|
}
|
||||||
|
binaryValue = null;
|
||||||
|
stringValue = null;
|
||||||
|
numericValue = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Hands the current field to the enclosing writer, passing this visitor itself as the StorableField. */
void write() throws IOException {
|
||||||
|
writeField(currentField, this);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -24,7 +24,6 @@ import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.codecs.StoredFieldsReader;
|
import org.apache.lucene.codecs.StoredFieldsReader;
|
||||||
import org.apache.lucene.codecs.StoredFieldsWriter;
|
import org.apache.lucene.codecs.StoredFieldsWriter;
|
||||||
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.SerializedDocument;
|
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.SerializedDocument;
|
||||||
import org.apache.lucene.document.DocumentStoredFieldVisitor;
|
|
||||||
import org.apache.lucene.index.FieldInfo;
|
import org.apache.lucene.index.FieldInfo;
|
||||||
import org.apache.lucene.index.FieldInfos;
|
import org.apache.lucene.index.FieldInfos;
|
||||||
import org.apache.lucene.index.IndexFileNames;
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
|
@ -41,6 +40,7 @@ import org.apache.lucene.util.Bits;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.GrowableByteArrayDataOutput;
|
import org.apache.lucene.util.GrowableByteArrayDataOutput;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
import org.apache.lucene.util.packed.PackedInts;
|
import org.apache.lucene.util.packed.PackedInts;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -74,7 +74,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||||
private CompressingStoredFieldsIndexWriter indexWriter;
|
private CompressingStoredFieldsIndexWriter indexWriter;
|
||||||
private IndexOutput fieldsStream;
|
private IndexOutput fieldsStream;
|
||||||
|
|
||||||
private final CompressionMode compressionMode;
|
|
||||||
private final Compressor compressor;
|
private final Compressor compressor;
|
||||||
private final int chunkSize;
|
private final int chunkSize;
|
||||||
private final int maxDocsPerChunk;
|
private final int maxDocsPerChunk;
|
||||||
|
@ -90,7 +89,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||||
String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException {
|
String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException {
|
||||||
assert directory != null;
|
assert directory != null;
|
||||||
this.segment = si.name;
|
this.segment = si.name;
|
||||||
this.compressionMode = compressionMode;
|
|
||||||
this.compressor = compressionMode.newCompressor();
|
this.compressor = compressionMode.newCompressor();
|
||||||
this.chunkSize = chunkSize;
|
this.chunkSize = chunkSize;
|
||||||
this.maxDocsPerChunk = maxDocsPerChunk;
|
this.maxDocsPerChunk = maxDocsPerChunk;
|
||||||
|
@ -238,6 +236,8 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||||
bufferedDocs.length = 0;
|
bufferedDocs.length = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
byte scratchBytes[] = new byte[16];
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void writeField(FieldInfo info, StorableField field)
|
public void writeField(FieldInfo info, StorableField field)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
@ -284,7 +284,11 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||||
bufferedDocs.writeVInt(bytes.length);
|
bufferedDocs.writeVInt(bytes.length);
|
||||||
bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
|
bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
|
||||||
} else if (string != null) {
|
} else if (string != null) {
|
||||||
bufferedDocs.writeString(field.stringValue());
|
// this is just an optimized writeString() that re-uses scratchBytes.
|
||||||
|
scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
|
||||||
|
int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
|
||||||
|
bufferedDocs.writeVInt(length);
|
||||||
|
bufferedDocs.writeBytes(scratchBytes, length);
|
||||||
} else {
|
} else {
|
||||||
if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
|
if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
|
||||||
bufferedDocs.writeZInt(number.intValue());
|
bufferedDocs.writeZInt(number.intValue());
|
||||||
|
@ -474,6 +478,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||||
MatchingReaders matching = new MatchingReaders(mergeState);
|
MatchingReaders matching = new MatchingReaders(mergeState);
|
||||||
|
|
||||||
for (int readerIndex=0;readerIndex<numReaders;readerIndex++) {
|
for (int readerIndex=0;readerIndex<numReaders;readerIndex++) {
|
||||||
|
MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
|
||||||
CompressingStoredFieldsReader matchingFieldsReader = null;
|
CompressingStoredFieldsReader matchingFieldsReader = null;
|
||||||
if (matching.matchingReaders[readerIndex]) {
|
if (matching.matchingReaders[readerIndex]) {
|
||||||
final StoredFieldsReader fieldsReader = mergeState.storedFieldsReaders[readerIndex];
|
final StoredFieldsReader fieldsReader = mergeState.storedFieldsReaders[readerIndex];
|
||||||
|
@ -497,9 +502,9 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||||
if (liveDocs != null && liveDocs.get(docID) == false) {
|
if (liveDocs != null && liveDocs.get(docID) == false) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
|
startDocument();
|
||||||
storedFieldsReader.visitDocument(docID, visitor);
|
storedFieldsReader.visitDocument(docID, visitor);
|
||||||
addDocument(visitor.getDocument(), mergeState.mergeFieldInfos);
|
finishDocument();
|
||||||
++docCount;
|
++docCount;
|
||||||
mergeState.checkAbort.work(300);
|
mergeState.checkAbort.work(300);
|
||||||
}
|
}
|
||||||
|
|
|
@ -123,7 +123,7 @@ public final class UnicodeUtil {
|
||||||
(UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;
|
(UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;
|
||||||
|
|
||||||
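/** Maximum number of UTF8 bytes per UTF16 character (i.e. per Java {@code char} / UTF-16 code unit, not per Unicode code point). */
|
/** Maximum number of UTF8 bytes per UTF16 character (i.e. per Java {@code char} / UTF-16 code unit, not per Unicode code point). */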
|
||||||
public static final int MAX_UTF8_BYTES_PER_CHAR = 4;
|
public static final int MAX_UTF8_BYTES_PER_CHAR = 3;
|
||||||
|
|
||||||
/** Encode characters from a char[] source, starting at
|
/** Encode characters from a char[] source, starting at
|
||||||
* offset for length chars. It is the responsibility of the
|
* offset for length chars. It is the responsibility of the
|
||||||
|
|
Loading…
Reference in New Issue