LUCENE-1960: Add support for reading Field.Store.COMPRESS fields again and decompress on merge/optimize. Also convert to a new stored file version number.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@829972 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2009-10-26 21:16:57 +00:00
parent 6a1f2d5729
commit c61fa48daf
6 changed files with 259 additions and 49 deletions

View File

@@ -15,6 +15,25 @@ Changes in runtime behavior
* LUCENE-1677: Remove the system property to set SegmentReader class
implementation. (Uwe Schindler)
* LUCENE-1960: As a consequence of the removal of Field.Store.COMPRESS,
support for this type of field was removed. Lucene 3.0 is still able
to read indexes with compressed fields, but as soon as merges occur
or the index is optimized, all compressed fields are decompressed
and converted to Field.Store.YES. Because of this, indexes with
compressed fields can suddenly get larger. Also, the first merge with
decompression cannot be done in raw mode and is therefore slower.
This change has no effect on code that uses such old indexes;
they behave as before (fields are automatically decompressed
during read). Indexes converted to the Lucene 3.0 format can no
longer be read by previous versions.
It is recommended to optimize your indexes after upgrading, to convert
them to the new format and decompress all fields.
If you want compressed fields, you can use CompressionTools, which
creates a compressed byte[] to be added as a binary stored field. This
cannot be done automatically, as you also have to decompress such
fields when reading; switching existing fields requires reindexing.
(Michael Busch, Uwe Schindler)
API Changes
* LUCENE-1257, LUCENE-1984, LUCENE-1985, ...: Port to Java 1.5 [not yet finished].
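
The CHANGES entry for LUCENE-1960 above points users at CompressionTools as the manual replacement for Field.Store.COMPRESS. A minimal sketch of that migration path (the field name "body" and the surrounding reader/writer setup are assumptions, not part of this patch):

import java.util.zip.DataFormatException;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

class CompressedFieldSketch {
  // Writing: compress in application code, store the result as a
  // plain binary stored field (Field.Store.YES).
  static void addCompressed(Document doc, String text) {
    byte[] compressed = CompressionTools.compressString(text);
    doc.add(new Field("body", compressed, Field.Store.YES));
  }

  // Reading: Lucene no longer decompresses for you, so the
  // application has to reverse the step itself.
  static String readCompressed(Document doc) throws DataFormatException {
    return CompressionTools.decompressString(doc.getBinaryValue("body"));
  }
}

Because the compression now lives entirely in application code, fields previously written with Field.Store.COMPRESS cannot be converted to this scheme automatically; as the entry says, that switch requires reindexing.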

View File

@@ -19,6 +19,7 @@ package org.apache.lucene.index;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.AbstractField;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
@@ -32,6 +33,7 @@ import org.apache.lucene.util.CloseableThreadLocal;
import java.io.IOException;
import java.io.Reader;
import java.util.zip.DataFormatException;
/**
* Class responsible for access to stored document fields.
@@ -203,7 +205,11 @@ final class FieldsReader implements Cloneable {
}
boolean canReadRawDocs() {
return format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
// Disable reading raw docs in 2.x format, because of the removal of compressed
// fields in 3.0. We don't want rawDocs() to decode field bits to figure out
// if a field was compressed, hence we enforce ordinary (non-raw) stored field merges
// for <3.0 indexes.
return format >= FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
}
final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
@@ -219,31 +225,34 @@
FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);
byte bits = fieldsStream.readByte();
assert bits <= FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;
assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;
boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true)
: "compressed fields are only allowed in indexes of version <= 2.9";
boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
//TODO: Find an alternative approach here if this list continues to grow beyond the
//list of 5 or 6 currently here. See Lucene 762 for discussion
if (acceptField.equals(FieldSelectorResult.LOAD)) {
addField(doc, fi, binary, tokenize);
addField(doc, fi, binary, compressed, tokenize);
}
else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){
addField(doc, fi, binary, tokenize);
addField(doc, fi, binary, compressed, tokenize);
break;//Get out of this loop
}
else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) {
addFieldLazy(doc, fi, binary, tokenize);
addFieldLazy(doc, fi, binary, compressed, tokenize);
}
else if (acceptField.equals(FieldSelectorResult.SIZE)){
skipField(binary, addFieldSize(doc, fi, binary));
skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed));
}
else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){
addFieldSize(doc, fi, binary);
addFieldSize(doc, fi, binary, compressed);
break;
}
else {
skipField(binary);
skipField(binary, compressed);
}
}
@@ -280,12 +289,12 @@ final class FieldsReader implements Cloneable {
* Skip the field. We still have to read some of the information about the field, but can skip past the actual content.
* This will have the most payoff on large fields.
*/
private void skipField(boolean binary) throws IOException {
skipField(binary, fieldsStream.readVInt());
private void skipField(boolean binary, boolean compressed) throws IOException {
skipField(binary, compressed, fieldsStream.readVInt());
}
private void skipField(boolean binary, int toRead) throws IOException {
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary) {
private void skipField(boolean binary, boolean compressed, int toRead) throws IOException {
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) {
fieldsStream.seek(fieldsStream.getFilePointer() + toRead);
} else {
// We need to skip chars. This will slow us down, but still better
@@ -293,12 +302,12 @@ final class FieldsReader implements Cloneable {
}
}
private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean tokenize) throws IOException {
private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
if (binary) {
int toRead = fieldsStream.readVInt();
long pointer = fieldsStream.getFilePointer();
//was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary));
doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed));
//Need to move the pointer ahead by toRead positions
fieldsStream.seek(pointer + toRead);
} else {
@@ -307,43 +316,75 @@
Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
AbstractField f;
int length = fieldsStream.readVInt();
long pointer = fieldsStream.getFilePointer();
//Skip ahead of where we are by the length of what is stored
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
fieldsStream.seek(pointer+length);
else
fieldsStream.skipChars(length);
f = new LazyField(fi.name, store, index, termVector, length, pointer, binary);
f.setOmitNorms(fi.omitNorms);
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
if (compressed) {
int toRead = fieldsStream.readVInt();
long pointer = fieldsStream.getFilePointer();
f = new LazyField(fi.name, store, toRead, pointer, binary, compressed);
//skip over the part that we aren't loading
fieldsStream.seek(pointer + toRead);
f.setOmitNorms(fi.omitNorms);
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
} else {
int length = fieldsStream.readVInt();
long pointer = fieldsStream.getFilePointer();
//Skip ahead of where we are by the length of what is stored
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
fieldsStream.seek(pointer+length);
} else {
fieldsStream.skipChars(length);
}
f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, compressed);
f.setOmitNorms(fi.omitNorms);
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
}
doc.add(f);
}
}
private void addField(Document doc, FieldInfo fi, boolean binary, boolean tokenize) throws CorruptIndexException, IOException {
private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws CorruptIndexException, IOException {
//we have a binary stored field, and it may be compressed
if (binary) {
int toRead = fieldsStream.readVInt();
final byte[] b = new byte[toRead];
fieldsStream.readBytes(b, 0, b.length);
doc.add(new Field(fi.name, b, Field.Store.YES));
if (compressed) {
doc.add(new Field(fi.name, uncompress(b), Field.Store.YES));
} else {
doc.add(new Field(fi.name, b, Field.Store.YES));
}
} else {
Field.Store store = Field.Store.YES;
Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
AbstractField f;
f = new Field(fi.name, // name
false,
fieldsStream.readString(), // read value
store,
index,
termVector);
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
f.setOmitNorms(fi.omitNorms);
if (compressed) {
int toRead = fieldsStream.readVInt();
final byte[] b = new byte[toRead];
fieldsStream.readBytes(b, 0, b.length);
f = new Field(fi.name, // field name
false,
new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
store,
index,
termVector);
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
f.setOmitNorms(fi.omitNorms);
} else {
f = new Field(fi.name, // name
false,
fieldsStream.readString(), // read value
store,
index,
termVector);
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
f.setOmitNorms(fi.omitNorms);
}
doc.add(f);
}
}
@@ -351,8 +392,8 @@ final class FieldsReader implements Cloneable {
// Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes)
// Read just the size -- caller must skip the field content to continue reading fields
// Return the size in bytes or chars, depending on field type
private int addFieldSize(Document doc, FieldInfo fi, boolean binary) throws IOException {
int size = fieldsStream.readVInt(), bytesize = binary ? size : 2*size;
private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed) throws IOException {
int size = fieldsStream.readVInt(), bytesize = binary || compressed ? size : 2*size;
byte[] sizebytes = new byte[4];
sizebytes[0] = (byte) (bytesize>>>24);
sizebytes[1] = (byte) (bytesize>>>16);
@@ -369,8 +410,10 @@ final class FieldsReader implements Cloneable {
private class LazyField extends AbstractField implements Fieldable {
private int toRead;
private long pointer;
/** @deprecated Only kept for backward-compatibility with <3.0 indexes. Will be removed in 4.0. */
private boolean isCompressed;
public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary) {
public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary, boolean isCompressed) {
super(name, store, Field.Index.NO, Field.TermVector.NO);
this.toRead = toRead;
this.pointer = pointer;
@@ -378,9 +421,10 @@
if (isBinary)
binaryLength = toRead;
lazy = true;
this.isCompressed = isCompressed;
}
public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary) {
public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary, boolean isCompressed) {
super(name, store, index, termVector);
this.toRead = toRead;
this.pointer = pointer;
@@ -388,6 +432,7 @@
if (isBinary)
binaryLength = toRead;
lazy = true;
this.isCompressed = isCompressed;
}
private IndexInput getFieldStream() {
@@ -427,15 +472,21 @@
IndexInput localFieldsStream = getFieldStream();
try {
localFieldsStream.seek(pointer);
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
byte[] bytes = new byte[toRead];
localFieldsStream.readBytes(bytes, 0, toRead);
fieldsData = new String(bytes, "UTF-8");
if (isCompressed) {
final byte[] b = new byte[toRead];
localFieldsStream.readBytes(b, 0, b.length);
fieldsData = new String(uncompress(b), "UTF-8");
} else {
//read in chars b/c we already know the length we need to read
char[] chars = new char[toRead];
localFieldsStream.readChars(chars, 0, toRead);
fieldsData = new String(chars);
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
byte[] bytes = new byte[toRead];
localFieldsStream.readBytes(bytes, 0, toRead);
fieldsData = new String(bytes, "UTF-8");
} else {
//read in chars b/c we already know the length we need to read
char[] chars = new char[toRead];
localFieldsStream.readChars(chars, 0, toRead);
fieldsData = new String(chars);
}
}
} catch (IOException e) {
throw new FieldReaderException(e);
@@ -484,7 +535,11 @@ final class FieldsReader implements Cloneable {
try {
localFieldsStream.seek(pointer);
localFieldsStream.readBytes(b, 0, toRead);
fieldsData = b;
if (isCompressed) {
fieldsData = uncompress(b);
} else {
fieldsData = b;
}
} catch (IOException e) {
throw new FieldReaderException(e);
}
@@ -498,4 +553,16 @@
return null;
}
}
private byte[] uncompress(byte[] b)
throws CorruptIndexException {
try {
return CompressionTools.decompress(b);
} catch (DataFormatException e) {
// this will happen if the field is not compressed
CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString());
newException.initCause(e);
throw newException;
}
}
}
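
The doc() method above dispatches on FieldSelectorResult, and LAZY_LOAD is what produces the LazyField instances defined in this class. As a usage sketch of that path (the field name "body" is an assumption), a caller can defer reading a large stored field until it is actually accessed:

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;

class LazyLoadSketch {
  static String loadBodyLazily(IndexReader reader, int docId) throws IOException {
    Document d = reader.document(docId, new FieldSelector() {
      public FieldSelectorResult accept(String fieldName) {
        return "body".equals(fieldName)
            ? FieldSelectorResult.LAZY_LOAD  // becomes a LazyField
            : FieldSelectorResult.LOAD;      // everything else loads eagerly
      }
    });
    Fieldable body = d.getFieldable("body");
    // The fields file is seeked and read only here -- and for a <3.0
    // segment, this is also where uncompress() runs.
    return body.stringValue();
  }
}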

View File

@@ -31,16 +31,22 @@ final class FieldsWriter
static final byte FIELD_IS_TOKENIZED = 0x1;
static final byte FIELD_IS_BINARY = 0x2;
/** @deprecated Kept for backwards-compatibility with <3.0 indexes; will be removed in 4.0 */
static final byte FIELD_IS_COMPRESSED = 0x4;
// Original format
static final int FORMAT = 0;
// Changed strings to UTF8
static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = 1;
// Lucene 3.0: Removal of compressed fields
static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2;
// NOTE: if you introduce a new format, make it 1 higher
// than the current one, and always change this if you
// switch to a new format!
static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
static final int FORMAT_CURRENT = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
private FieldInfos fieldInfos;
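
Bumping FORMAT_CURRENT means every newly written segment advertises the compression-free format, which is exactly what canReadRawDocs() in FieldsReader keys on. The upgrade recommended in CHANGES is then just an optimize; a sketch, assuming an existing index directory (the WhitespaceAnalyzer is an arbitrary choice, since no documents are added):

import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

class UpgradeSketch {
  // Merging every segment rewrites it with FORMAT_CURRENT
  // (FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) and decompresses any
  // remaining Field.Store.COMPRESS data on the way.
  static void upgrade(File indexDir) throws IOException {
    Directory dir = FSDirectory.open(indexDir);
    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(),
                                    IndexWriter.MaxFieldLength.LIMITED);
    try {
      w.optimize(); // forces the non-raw merge for <3.0 segments
    } finally {
      w.close();
      dir.close();
    }
  }
}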

View File

@@ -22,21 +22,28 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.List;
import java.util.ArrayList;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
@@ -127,22 +134,105 @@ public class TestBackwardsCompatibility extends LuceneTestCase
"23.nocfs",
"24.cfs",
"24.nocfs",
"29.cfs",
"29.nocfs",
};
private void assertCompressedFields29(Directory dir, boolean shouldStillBeCompressed) throws IOException {
int count = 0;
final int TEXT_PLAIN_LENGTH = TEXT_TO_COMPRESS.length() * 2;
// FieldSelectorResult.SIZE returns 2*number_of_chars for String fields:
final int BINARY_PLAIN_LENGTH = BINARY_TO_COMPRESS.length;
IndexReader reader = IndexReader.open(dir, true);
try {
// look into sub readers and check if raw merge is on/off
List<IndexReader> readers = new ArrayList<IndexReader>();
ReaderUtil.gatherSubReaders(readers, reader);
for (IndexReader ir : readers) {
final FieldsReader fr = ((SegmentReader) ir).getFieldsReader();
assertTrue("for a 2.9 index, FieldsReader.canReadRawDocs() must be false and other way round for a trunk index",
shouldStillBeCompressed != fr.canReadRawDocs());
}
// test that decompression works correctly
for(int i=0; i<reader.maxDoc(); i++) {
if (!reader.isDeleted(i)) {
Document d = reader.document(i);
if (d.get("content3") != null) continue;
count++;
Fieldable compressed = d.getFieldable("compressed");
if (Integer.parseInt(d.get("id")) % 2 == 0) {
assertFalse(compressed.isBinary());
assertEquals("incorrectly decompressed string", TEXT_TO_COMPRESS, compressed.stringValue());
} else {
assertTrue(compressed.isBinary());
assertTrue("incorrectly decompressed binary", Arrays.equals(BINARY_TO_COMPRESS, compressed.getBinaryValue()));
}
}
}
// check if field was decompressed after optimize
for(int i=0; i<reader.maxDoc(); i++) {
if (!reader.isDeleted(i)) {
Document d = reader.document(i, new FieldSelector() {
public FieldSelectorResult accept(String fieldName) {
return ("compressed".equals(fieldName)) ? FieldSelectorResult.SIZE : FieldSelectorResult.LOAD;
}
});
if (d.get("content3") != null) continue;
count++;
// read the size from the binary value using DataInputStream (this prevents us from doing the shift ops ourselves):
final DataInputStream ds = new DataInputStream(new ByteArrayInputStream(d.getFieldable("compressed").getBinaryValue()));
final int actualSize = ds.readInt();
ds.close();
final int compressedSize = Integer.parseInt(d.get("compressedSize"));
final boolean binary = Integer.parseInt(d.get("id")) % 2 > 0;
final int shouldSize = shouldStillBeCompressed ?
compressedSize :
(binary ? BINARY_PLAIN_LENGTH : TEXT_PLAIN_LENGTH);
assertEquals("size incorrect", shouldSize, actualSize);
if (!shouldStillBeCompressed) {
assertFalse("uncompressed field should have another size than recorded in index", compressedSize == actualSize);
}
}
}
assertEquals("correct number of tests", 34 * 2, count);
} finally {
reader.close();
}
}
public void testOptimizeOldIndex() throws IOException {
int hasTested29 = 0;
for(int i=0;i<oldNames.length;i++) {
String dirName = "src/test/org/apache/lucene/index/index." + oldNames[i];
unzip(dirName, oldNames[i]);
String fullPath = fullDir(oldNames[i]);
Directory dir = FSDirectory.open(new File(fullPath));
if (oldNames[i].startsWith("29.")) {
assertCompressedFields29(dir, true);
hasTested29++;
}
IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
w.optimize();
w.close();
_TestUtil.checkIndex(dir);
if (oldNames[i].startsWith("29.")) {
assertCompressedFields29(dir, false);
hasTested29++;
}
dir.close();
rmDir(oldNames[i]);
}
assertEquals("test for compressed field should have run 4 times", 4, hasTested29);
}
public void testSearchOldIndex() throws IOException {
@@ -203,7 +293,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase
!oldName.startsWith("22.")) {
if (d.getField("content3") == null) {
assertEquals(5, fields.size());
final int numFields = oldName.startsWith("29.") ? 7 : 5;
assertEquals(numFields, fields.size());
Field f = (Field) d.getField("id");
assertEquals(""+i, f.stringValue());
@@ -497,6 +588,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase
doc.add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
doc.add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
doc.add(new Field("fie\u2C77ld", "field with non-ascii name", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
/* This was used in 2.9 to generate an index with compressed field:
if (id % 2 == 0) {
doc.add(new Field("compressed", TEXT_TO_COMPRESS, Field.Store.COMPRESS, Field.Index.NOT_ANALYZED));
doc.add(new Field("compressedSize", Integer.toString(TEXT_COMPRESSED_LENGTH), Field.Store.YES, Field.Index.NOT_ANALYZED));
} else {
doc.add(new Field("compressed", BINARY_TO_COMPRESS, Field.Store.COMPRESS));
doc.add(new Field("compressedSize", Integer.toString(BINARY_COMPRESSED_LENGTH), Field.Store.YES, Field.Index.NOT_ANALYZED));
}
*/
writer.addDocument(doc);
}
@@ -527,4 +627,22 @@ public class TestBackwardsCompatibility extends LuceneTestCase
public static String fullDir(String dirName) throws IOException {
return new File(System.getProperty("tempDir"), dirName).getCanonicalPath();
}
static final String TEXT_TO_COMPRESS = "this is a compressed field and should appear in 3.0 as an uncompressed field after merge";
// FieldSelectorResult.SIZE returns compressed size for compressed fields, which are internally handled as binary;
// do it the same way as FieldsWriter does; do not use CompressionTools.compressString() for compressed fields:
/* This was used in 2.9 to generate an index with compressed field:
static final int TEXT_COMPRESSED_LENGTH;
static {
try {
TEXT_COMPRESSED_LENGTH = CompressionTools.compress(TEXT_TO_COMPRESS.getBytes("UTF-8")).length;
} catch (Exception e) {
throw new RuntimeException();
}
}
*/
static final byte[] BINARY_TO_COMPRESS = new byte[]{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
/* This was used in 2.9 to generate an index with compressed field:
static final int BINARY_COMPRESSED_LENGTH = CompressionTools.compress(BINARY_TO_COMPRESS).length;
*/
}
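
The size assertions in assertCompressedFields29() above depend on how FieldSelectorResult.SIZE reports a field: addFieldSize() in FieldsReader stores the byte size as a 4-byte big-endian int in the field's binary value. A standalone sketch of decoding it, matching what the test does with DataInputStream:

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;

class FieldSizeSketch {
  // addFieldSize() writes bytesize>>>24 first, i.e. high order byte
  // first (big-endian), which is the layout DataInputStream.readInt()
  // expects -- no manual shift ops needed.
  static int decode(byte[] sizeBytes) throws IOException {
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(sizeBytes));
    try {
      return in.readInt();
    } finally {
      in.close();
    }
  }
}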

Binary file not shown.

Binary file not shown.