diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f12d0aabbbe..effefaaff5f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -183,6 +183,9 @@ Optimizations * LUCENE-6131: Optimize SortingMergePolicy. (Robert Muir) +* LUCENE-6133: Improve default StoredFieldsWriter.merge() to be more efficient. + (Robert Muir) + API Changes * LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and diff --git a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java index ce8b6b18af8..d8be8ae8047 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java @@ -18,14 +18,17 @@ package org.apache.lucene.codecs; import java.io.Closeable; import java.io.IOException; +import java.io.Reader; -import org.apache.lucene.document.DocumentStoredFieldVisitor; +import org.apache.lucene.document.StoredField; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexableFieldType; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.StorableField; -import org.apache.lucene.index.StoredDocument; +import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; /** * Codec API for writing stored fields: @@ -81,6 +84,7 @@ public abstract class StoredFieldsWriter implements Closeable { for (int i=0;i doc, FieldInfos fieldInfos) throws IOException { - startDocument(); - - for (StorableField field : doc) { - writeField(fieldInfos.fieldInfo(field.name()), field); + /** + * A visitor that adds every field it sees. + *

+ * Use like this: + *

+   * MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
+   * for (...) {
+   *   startDocument();
+   *   storedFieldsReader.visitDocument(docID, visitor);
+   *   finishDocument();
+   * }
+   * 
+ */ + protected class MergeVisitor extends StoredFieldVisitor implements StorableField { + BytesRef binaryValue; + String stringValue; + Number numericValue; + FieldInfo currentField; + FieldInfos remapper; + + /** + * Create new merge visitor. + */ + public MergeVisitor(MergeState mergeState, int readerIndex) { + // if field numbers are aligned, we can save hash lookups + // on every field access. Otherwise, we need to lookup + // fieldname each time, and remap to a new number. + for (FieldInfo fi : mergeState.fieldInfos[readerIndex]) { + FieldInfo other = mergeState.mergeFieldInfos.fieldInfo(fi.number); + if (other == null || !other.name.equals(fi.name)) { + remapper = mergeState.mergeFieldInfos; + break; + } + } + } + + @Override + public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException { + reset(fieldInfo); + binaryValue = new BytesRef(value); + write(); } - finishDocument(); + @Override + public void stringField(FieldInfo fieldInfo, String value) throws IOException { + reset(fieldInfo); + stringValue = value; + write(); + } + + @Override + public void intField(FieldInfo fieldInfo, int value) throws IOException { + reset(fieldInfo); + numericValue = value; + write(); + } + + @Override + public void longField(FieldInfo fieldInfo, long value) throws IOException { + reset(fieldInfo); + numericValue = value; + write(); + } + + @Override + public void floatField(FieldInfo fieldInfo, float value) throws IOException { + reset(fieldInfo); + numericValue = value; + write(); + } + + @Override + public void doubleField(FieldInfo fieldInfo, double value) throws IOException { + reset(fieldInfo); + numericValue = value; + write(); + } + + @Override + public Status needsField(FieldInfo fieldInfo) throws IOException { + return Status.YES; + } + + @Override + public String name() { + return currentField.name; + } + + @Override + public IndexableFieldType fieldType() { + return StoredField.TYPE; + } + + @Override + public BytesRef binaryValue() { + return binaryValue; + } + + @Override + public String stringValue() { + return stringValue; + } + + @Override + public Number numericValue() { + return numericValue; + } + + @Override + public Reader readerValue() { + return null; + } + + void reset(FieldInfo field) { + if (remapper != null) { + // field numbers are not aligned, we need to remap to the new field number + currentField = remapper.fieldInfo(field.name); + } else { + currentField = field; + } + binaryValue = null; + stringValue = null; + numericValue = null; + } + + void write() throws IOException { + writeField(currentField, this); + } } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java index f579cb6ecc1..6982ef80a70 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java @@ -24,7 +24,6 @@ import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.StoredFieldsWriter; import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.SerializedDocument; -import org.apache.lucene.document.DocumentStoredFieldVisitor; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; @@ -41,6 +40,7 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.GrowableByteArrayDataOutput; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.packed.PackedInts; /** @@ -74,7 +74,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { private CompressingStoredFieldsIndexWriter indexWriter; private IndexOutput fieldsStream; - private final CompressionMode compressionMode; private final Compressor compressor; private final int chunkSize; private final int maxDocsPerChunk; @@ -90,7 +89,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException { assert directory != null; this.segment = si.name; - this.compressionMode = compressionMode; this.compressor = compressionMode.newCompressor(); this.chunkSize = chunkSize; this.maxDocsPerChunk = maxDocsPerChunk; @@ -237,6 +235,8 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { numBufferedDocs = 0; bufferedDocs.length = 0; } + + byte scratchBytes[] = new byte[16]; @Override public void writeField(FieldInfo info, StorableField field) @@ -284,7 +284,11 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { bufferedDocs.writeVInt(bytes.length); bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length); } else if (string != null) { - bufferedDocs.writeString(field.stringValue()); + // this is just an optimized writeString() that re-uses scratchBytes. + scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR); + int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes); + bufferedDocs.writeVInt(length); + bufferedDocs.writeBytes(scratchBytes, length); } else { if (number instanceof Byte || number instanceof Short || number instanceof Integer) { bufferedDocs.writeZInt(number.intValue()); @@ -474,6 +478,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { MatchingReaders matching = new MatchingReaders(mergeState); for (int readerIndex=0;readerIndex