mirror of https://github.com/apache/lucene.git
synced 2025-02-06 10:08:58 +00:00
- LUCENE-629: indexing and optimizing performance improvements when working with compressed fields
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@431148 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 395115d8d7
commit 1a933665d6
@@ -108,6 +108,10 @@ Optimizations
     internal "files", allowing them to be GCed even if references to the
     RAMDirectory itself still exist. (Nadav Har'El via Chris Hostetter)
 
+ 3. LUCENE-629: Compressed fields are no longer uncompressed and recompressed
+    during segment merges (e.g. during indexing or optimizing), thus improving
+    performance. (Michael Busch via Otis Gospodnetic)
+
 Release 2.0.0 2006-05-26
 
 API Changes
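
The scenario this entry covers is an index whose stored fields use compression and which then gets merged, either by ongoing indexing or by an explicit optimize. A minimal sketch of such a setup against the 2.x-era public API (RAMDirectory, WhitespaceAnalyzer, Field.Store.COMPRESS and IndexWriter.optimize() are the assumed entry points; the document count and the "body" field name are made up for illustration):

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;

    public class CompressedFieldMergeDemo {
      public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
        for (int i = 0; i < 1000; i++) {
          Document doc = new Document();
          // stored, compressed field: the data that segment merges used to
          // uncompress and recompress on every merge
          doc.add(new Field("body", "some longish stored text " + i,
                            Field.Store.COMPRESS, Field.Index.TOKENIZED));
          writer.addDocument(doc);
        }
        // optimize() forces segment merges; after this change the compressed
        // "body" bytes are copied between segments as-is
        writer.optimize();
        writer.close();
      }
    }

Before this commit every such merge inflated and re-deflated the stored bytes of each compressed field; with it, the compressed bytes pass through the merge untouched.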
@@ -26,11 +26,11 @@ public final class FieldSelectorResult {
   public static final FieldSelectorResult LAZY_LOAD = new FieldSelectorResult(1);
   public static final FieldSelectorResult NO_LOAD = new FieldSelectorResult(2);
   public static final FieldSelectorResult LOAD_AND_BREAK = new FieldSelectorResult(3);
+  public static final FieldSelectorResult LOAD_FOR_MERGE = new FieldSelectorResult(4);
 
   private int id;
 
-  private FieldSelectorResult(int id)
-  {
+  private FieldSelectorResult(int id) {
     this.id = id;
   }
 
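
The constants above are the possible answers a FieldSelector gives, per field name, when a stored document is loaded; LOAD_FOR_MERGE is the new, index-internal answer that SegmentMerger uses further down in this commit. For contrast, a typical application-side selector might look like the following sketch (the field names "title" and "body" and the wrapper class are hypothetical):

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.FieldSelector;
    import org.apache.lucene.document.FieldSelectorResult;
    import org.apache.lucene.index.IndexReader;

    class TitleOnlyLoader {
      // Load "title" eagerly, defer "body" until it is asked for, skip the rest.
      static final FieldSelector TITLE_ONLY = new FieldSelector() {
        public FieldSelectorResult accept(String fieldName) {
          if ("title".equals(fieldName)) return FieldSelectorResult.LOAD;
          if ("body".equals(fieldName)) return FieldSelectorResult.LAZY_LOAD;
          return FieldSelectorResult.NO_LOAD;
        }
      };

      static Document load(IndexReader reader, int docId) throws java.io.IOException {
        return reader.document(docId, TITLE_ONLY);
      }
    }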
@@ -16,16 +16,21 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
-import org.apache.lucene.document.*;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IndexInput;
-
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.zip.DataFormatException;
 import java.util.zip.Inflater;
 
+import org.apache.lucene.document.AbstractField;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+
 /**
  * Class responsible for access to stored document fields.
  * <p/>
@@ -89,6 +94,9 @@ final class FieldsReader {
         if (acceptField.equals(FieldSelectorResult.LOAD) == true) {
           addField(doc, fi, binary, compressed, tokenize);
         }
+        else if (acceptField.equals(FieldSelectorResult.LOAD_FOR_MERGE) == true) {
+          addFieldForMerge(doc, fi, binary, compressed, tokenize);
+        }
         else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK) == true){
           addField(doc, fi, binary, compressed, tokenize);
           break;//Get out of this loop
@@ -161,6 +169,22 @@ final class FieldsReader {
 
   }
 
+  // in merge mode we don't uncompress the data of a compressed field
+  private void addFieldForMerge(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
+    Object data;
+
+    if (binary || compressed) {
+      int toRead = fieldsStream.readVInt();
+      final byte[] b = new byte[toRead];
+      fieldsStream.readBytes(b, 0, b.length);
+      data = b;
+    } else {
+      data = fieldsStream.readString();
+    }
+
+    doc.add(new FieldForMerge(data, fi, binary, compressed, tokenize));
+  }
+
   private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
 
     //we have a binary stored field, and it may be compressed
@@ -370,4 +394,37 @@ final class FieldsReader {
     // Get the decompressed data
     return bos.toByteArray();
   }
+
+  // Instances of this class hold field properties and data
+  // for merge
+  final static class FieldForMerge extends AbstractField {
+    public String stringValue() {
+      return (String) this.fieldsData;
+    }
+
+    public Reader readerValue() {
+      // not needed for merge
+      return null;
+    }
+
+    public byte[] binaryValue() {
+      return (byte[]) this.fieldsData;
+    }
+
+    public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
+      this.isStored = true;
+      this.fieldsData = value;
+      this.isCompressed = compressed;
+      this.isBinary = binary;
+      this.isTokenized = tokenize;
+
+      this.name = fi.name.intern();
+      this.isIndexed = fi.isIndexed;
+      this.omitNorms = fi.omitNorms;
+      this.storeOffsetWithTermVector = fi.storeOffsetWithTermVector;
+      this.storePositionWithTermVector = fi.storePositionWithTermVector;
+      this.storeTermVector = fi.storeTermVector;
+    }
+
+  }
 }
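
Only the tail of FieldsReader's existing uncompress helper is visible in the hunk above ("return bos.toByteArray();"). For orientation, the read-side work that LOAD_FOR_MERGE and addFieldForMerge now skip is an Inflater loop of roughly this shape (a sketch, not the exact method body):

    import java.io.ByteArrayOutputStream;
    import java.util.zip.DataFormatException;
    import java.util.zip.Inflater;

    final class UncompressSketch {
      // Approximate shape of the decompression that merge-mode loading bypasses.
      static byte[] uncompress(byte[] input) throws DataFormatException {
        Inflater decompressor = new Inflater();
        decompressor.setInput(input);
        ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
        byte[] buf = new byte[1024];
        while (!decompressor.finished()) {
          int count = decompressor.inflate(buf);
          bos.write(buf, 0, count);
        }
        decompressor.end();
        // Get the decompressed data
        return bos.toByteArray();
      }
    }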
@@ -23,6 +23,7 @@ import java.util.zip.Deflater;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexOutput;
 
@@ -55,7 +56,7 @@ final class FieldsWriter
       int storedCount = 0;
       Enumeration fields = doc.fields();
       while (fields.hasMoreElements()) {
-        Field field = (Field) fields.nextElement();
+        Fieldable field = (Fieldable) fields.nextElement();
         if (field.isStored())
           storedCount++;
       }
@@ -63,7 +64,11 @@ final class FieldsWriter
 
       fields = doc.fields();
       while (fields.hasMoreElements()) {
-        Field field = (Field) fields.nextElement();
+        Fieldable field = (Fieldable) fields.nextElement();
+        // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
+        // and field.binaryValue() already returns the compressed value for a field
+        // with isCompressed()==true, so we disable compression in that case
+        boolean disableCompression = (field instanceof FieldsReader.FieldForMerge);
         if (field.isStored()) {
           fieldsStream.writeVInt(fieldInfos.fieldNumber(field.name()));
 
@@ -80,12 +85,19 @@ final class FieldsWriter
           if (field.isCompressed()) {
             // compression is enabled for the current field
             byte[] data = null;
-            // check if it is a binary field
-            if (field.isBinary()) {
-              data = compress(field.binaryValue());
-            }
-            else {
-              data = compress(field.stringValue().getBytes("UTF-8"));
+
+            if (disableCompression) {
+              // optimized case for merging, the data
+              // is already compressed
+              data = field.binaryValue();
+            } else {
+              // check if it is a binary field
+              if (field.isBinary()) {
+                data = compress(field.binaryValue());
+              }
+              else {
+                data = compress(field.stringValue().getBytes("UTF-8"));
+              }
             }
             final int len = data.length;
             fieldsStream.writeVInt(len);
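
Symmetrically, when disableCompression is true FieldsWriter never calls its compress helper for the merged field. That helper is not shown in this diff; it is essentially a Deflater loop, sketched here to show the write-side work that merging now avoids (buffer size and compression level are assumptions):

    import java.io.ByteArrayOutputStream;
    import java.util.zip.Deflater;

    final class CompressSketch {
      // Approximate shape of the compression that merge-mode writing bypasses.
      static byte[] compress(byte[] input) {
        Deflater compressor = new Deflater();
        compressor.setLevel(Deflater.BEST_COMPRESSION);
        compressor.setInput(input);
        compressor.finish();
        ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
        byte[] buf = new byte[1024];
        while (!compressor.finished()) {
          int count = compressor.deflate(buf);
          bos.write(buf, 0, count);
        }
        compressor.end();
        return bos.toByteArray();
      }
    }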
@@ -21,6 +21,8 @@ import java.util.Iterator;
 import java.util.Collection;
 import java.io.IOException;
 
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.RAMOutputStream;
@@ -177,13 +179,22 @@ final class SegmentMerger {
 
     FieldsWriter fieldsWriter = // merge field values
             new FieldsWriter(directory, segment, fieldInfos);
+
+    // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
+    // in merge mode, we use this FieldSelector
+    FieldSelector fieldSelectorMerge = new FieldSelector() {
+      public FieldSelectorResult accept(String fieldName) {
+        return FieldSelectorResult.LOAD_FOR_MERGE;
+      }
+    };
     try {
       for (int i = 0; i < readers.size(); i++) {
         IndexReader reader = (IndexReader) readers.elementAt(i);
         int maxDoc = reader.maxDoc();
         for (int j = 0; j < maxDoc; j++)
           if (!reader.isDeleted(j)) { // skip deleted docs
-            fieldsWriter.addDocument(reader.document(j));
+            fieldsWriter.addDocument(reader.document(j, fieldSelectorMerge));
             docCount++;
           }
       }