- LUCENE-629: indexing and optimizing performance improvements when working with compressed fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@431148 13f79535-47bb-0310-9956-ffa450edef68
Otis Gospodnetic 2006-08-13 06:12:07 +00:00
parent 395115d8d7
commit 1a933665d6
5 changed files with 100 additions and 16 deletions

View File: CHANGES.txt

@@ -108,6 +108,10 @@ Optimizations
internal "files", allowing them to be GCed even if references to the
RAMDirectory itself still exist. (Nadav Har'El via Chris Hostetter)
3. LUCENE-629: Compressed fields are no longer uncompressed and recompressed
during segment merges (e.g. during indexing or optimizing), thus improving
performance. (Michael Busch via Otis Gospodnetic)
Release 2.0.0 2006-05-26
API Changes
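
For context on the entry above, a minimal sketch of the workload this change speeds up: documents with compressed stored fields, where segment merges (triggered by ordinary indexing and by optimize()) previously inflated and re-deflated every stored value. The index path, analyzer, and field contents are illustrative, not part of this commit; the API is the Lucene 2.x style used in the files below.

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

// Sketch: an index whose merges benefit from the compressed-field pass-through.
public class CompressedFieldIndexing {
  public static void main(String[] args) throws Exception {
    IndexWriter writer = new IndexWriter("/tmp/demo-index", new SimpleAnalyzer(), true);
    for (int i = 0; i < 10000; i++) {
      Document doc = new Document();
      // Field.Store.COMPRESS deflates the stored value on disk
      doc.add(new Field("body", "some repetitive body text " + i,
                        Field.Store.COMPRESS, Field.Index.TOKENIZED));
      writer.addDocument(doc); // intermittently triggers segment merges
    }
    // optimize() merges all segments into one; with this change the compressed
    // bytes are copied as-is instead of being uncompressed and recompressed
    writer.optimize();
    writer.close();
  }
}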

View File: FieldSelectorResult.java

@@ -26,11 +26,11 @@ public final class FieldSelectorResult {
public static final FieldSelectorResult LAZY_LOAD = new FieldSelectorResult(1);
public static final FieldSelectorResult NO_LOAD = new FieldSelectorResult(2);
public static final FieldSelectorResult LOAD_AND_BREAK = new FieldSelectorResult(3);
public static final FieldSelectorResult LOAD_FOR_MERGE = new FieldSelectorResult(4);
private int id;
-private FieldSelectorResult(int id)
-{
+private FieldSelectorResult(int id) {
this.id = id;
}
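
A hedged usage sketch of the new constant: a FieldSelector returning LOAD_FOR_MERGE asks FieldsReader for merge-mode loading, mirroring the anonymous selector SegmentMerger installs later in this commit. The index path and document id are illustrative.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.index.IndexReader;

// Sketch: loading a document in merge mode, so compressed fields keep their raw bytes.
public class LoadForMergeExample {
  public static void main(String[] args) throws Exception {
    FieldSelector mergeSelector = new FieldSelector() {
      public FieldSelectorResult accept(String fieldName) {
        // LOAD_FOR_MERGE tells FieldsReader not to uncompress compressed fields
        return FieldSelectorResult.LOAD_FOR_MERGE;
      }
    };
    IndexReader reader = IndexReader.open("/tmp/demo-index");
    // stored fields come back as FieldsReader.FieldForMerge instances
    Document doc = reader.document(0, mergeSelector);
    reader.close();
  }
}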

View File: FieldsReader.java

@@ -16,16 +16,21 @@ package org.apache.lucene.index;
* limitations under the License.
*/
-import org.apache.lucene.document.*;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IndexInput;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
+import org.apache.lucene.document.AbstractField;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
/**
* Class responsible for access to stored document fields.
* <p/>
@@ -89,6 +94,9 @@ final class FieldsReader {
if (acceptField.equals(FieldSelectorResult.LOAD) == true) {
addField(doc, fi, binary, compressed, tokenize);
}
else if (acceptField.equals(FieldSelectorResult.LOAD_FOR_MERGE) == true) {
addFieldForMerge(doc, fi, binary, compressed, tokenize);
}
else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK) == true){
addField(doc, fi, binary, compressed, tokenize);
break;//Get out of this loop
@@ -161,6 +169,22 @@ final class FieldsReader {
}
// in merge mode we don't uncompress the data of a compressed field
private void addFieldForMerge(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
Object data;
if (binary || compressed) {
int toRead = fieldsStream.readVInt();
final byte[] b = new byte[toRead];
fieldsStream.readBytes(b, 0, b.length);
data = b;
} else {
data = fieldsStream.readString();
}
doc.add(new FieldForMerge(data, fi, binary, compressed, tokenize));
}
private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
//we have a binary stored field, and it may be compressed
@@ -370,4 +394,37 @@ final class FieldsReader {
// Get the decompressed data
return bos.toByteArray();
}
// Instances of this class hold field properties and data
// for merge
final static class FieldForMerge extends AbstractField {
public String stringValue() {
return (String) this.fieldsData;
}
public Reader readerValue() {
// not needed for merge
return null;
}
public byte[] binaryValue() {
return (byte[]) this.fieldsData;
}
public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
this.isStored = true;
this.fieldsData = value;
this.isCompressed = compressed;
this.isBinary = binary;
this.isTokenized = tokenize;
this.name = fi.name.intern();
this.isIndexed = fi.isIndexed;
this.omitNorms = fi.omitNorms;
this.storeOffsetWithTermVector = fi.storeOffsetWithTermVector;
this.storePositionWithTermVector = fi.storePositionWithTermVector;
this.storeTermVector = fi.storeTermVector;
}
}
}
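
To make the saving concrete, a minimal sketch, using only java.util.zip, of the inflate-then-deflate round trip the old merge path performed for every compressed field; the buffer size and compression level are illustrative. FieldForMerge avoids this entirely by handing the still-compressed bytes straight back to FieldsWriter.

import java.io.ByteArrayOutputStream;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

// Sketch: the per-field work that merging compressed fields used to require.
public class RecompressCost {
  static byte[] inflateThenDeflate(byte[] compressed) throws DataFormatException {
    // 1) uncompress, as the old FieldsReader did during merges
    // (assumes well-formed, complete input)
    Inflater inflater = new Inflater();
    inflater.setInput(compressed);
    ByteArrayOutputStream plainOut = new ByteArrayOutputStream();
    byte[] buf = new byte[1024];
    while (!inflater.finished()) {
      int n = inflater.inflate(buf);
      plainOut.write(buf, 0, n);
    }
    inflater.end();
    byte[] plain = plainOut.toByteArray();

    // 2) recompress, as the old FieldsWriter did when re-storing the field
    Deflater deflater = new Deflater(Deflater.BEST_COMPRESSION);
    deflater.setInput(plain);
    deflater.finish();
    ByteArrayOutputStream compOut = new ByteArrayOutputStream();
    while (!deflater.finished()) {
      int n = deflater.deflate(buf);
      compOut.write(buf, 0, n);
    }
    deflater.end();
    return compOut.toByteArray(); // the new merge path just copies `compressed`
  }
}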

View File: FieldsWriter.java

@@ -23,6 +23,7 @@ import java.util.zip.Deflater;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
@@ -55,7 +56,7 @@ final class FieldsWriter
int storedCount = 0;
Enumeration fields = doc.fields();
while (fields.hasMoreElements()) {
-Field field = (Field) fields.nextElement();
+Fieldable field = (Fieldable) fields.nextElement();
if (field.isStored())
storedCount++;
}
@@ -63,7 +64,11 @@ final class FieldsWriter
fields = doc.fields();
while (fields.hasMoreElements()) {
-Field field = (Field) fields.nextElement();
+Fieldable field = (Fieldable) fields.nextElement();
+// if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
+// and field.binaryValue() already returns the compressed value for a field
+// with isCompressed()==true, so we disable compression in that case
+boolean disableCompression = (field instanceof FieldsReader.FieldForMerge);
if (field.isStored()) {
fieldsStream.writeVInt(fieldInfos.fieldNumber(field.name()));
@@ -80,12 +85,19 @@ final class FieldsWriter
if (field.isCompressed()) {
// compression is enabled for the current field
byte[] data = null;
-// check if it is a binary field
-if (field.isBinary()) {
-  data = compress(field.binaryValue());
-}
-else {
-  data = compress(field.stringValue().getBytes("UTF-8"));
-}
+if (disableCompression) {
+  // optimized case for merging, the data
+  // is already compressed
+  data = field.binaryValue();
+} else {
+  // check if it is a binary field
+  if (field.isBinary()) {
+    data = compress(field.binaryValue());
+  }
+  else {
+    data = compress(field.stringValue().getBytes("UTF-8"));
+  }
+}
final int len = data.length;
fieldsStream.writeVInt(len);

View File: SegmentMerger.java

@@ -21,6 +21,8 @@ import java.util.Iterator;
import java.util.Collection;
import java.io.IOException;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
@@ -177,13 +179,22 @@ final class SegmentMerger {
FieldsWriter fieldsWriter = // merge field values
new FieldsWriter(directory, segment, fieldInfos);
// for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
// in merge mode, we use this FieldSelector
FieldSelector fieldSelectorMerge = new FieldSelector() {
public FieldSelectorResult accept(String fieldName) {
return FieldSelectorResult.LOAD_FOR_MERGE;
}
};
try {
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader) readers.elementAt(i);
int maxDoc = reader.maxDoc();
for (int j = 0; j < maxDoc; j++)
if (!reader.isDeleted(j)) { // skip deleted docs
-fieldsWriter.addDocument(reader.document(j));
+fieldsWriter.addDocument(reader.document(j, fieldSelectorMerge));
docCount++;
}
}