diff --git a/CHANGES.txt b/CHANGES.txt
index 5d53f62ea85..2cdee087521 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -15,6 +15,25 @@ Changes in runtime behavior
 * LUCENE-1677: Remove the system property to set SegmentReader class
   implementation. (Uwe Schindler)
 
+* LUCENE-1960: As a consequence of the removal of Field.Store.COMPRESS,
+  support for this type of fields was removed. Lucene 3.0 is still able
+  to read indexes with compressed fields, but as soon as merges occur
+  or the index is optimized, all compressed fields are decompressed
+  and converted to Field.Store.YES. Because of this, indexes with
+  compressed fields can suddenly get larger. Also the first merge with
+  decompression cannot be done in raw mode, it is therefore slower.
+  This change has no effect for code that uses such old indexes,
+  they behave as before (fields are automatically decompressed
+  during read). Indexes converted to Lucene 3.0 format cannot be read
+  anymore with previous versions.
+  It is recommended to optimize your indexes after upgrading to convert
+  to the new format and decompress all fields.
+  If you want compressed fields, you can use CompressionTools, that
+  creates compressed byte[] to be added as binary stored field. This
+  cannot be done automatically, as you also have to decompress such
+  fields when reading. You have to reindex to do that.
+  (Michael Busch, Uwe Schindler)
+
 API Changes
 
 * LUCENE-1257, LUCENE-1984, LUCENE-1985, ...: Port to Java 1.5 [not yet finished].
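The entry above points to CompressionTools as the replacement for Field.Store.COMPRESS. A minimal sketch of that migration, assuming the Lucene 2.9/3.0 API (the helper class and the "body" field name are illustrative, not part of this patch): the application compresses the value before adding it as a binary stored field, and decompresses it again after loading the document. As the entry notes, existing compressed fields cannot be converted to this scheme automatically; the affected documents have to be reindexed.

    // Illustrative sketch only: manual compress-on-write / decompress-on-read
    // replacing Field.Store.COMPRESS. The "body" field name is made up.
    import java.util.zip.DataFormatException;

    import org.apache.lucene.document.CompressionTools;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;

    class CompressedFieldExample {

      // Indexing side: CompressionTools.compressString returns the deflated bytes,
      // which are stored as an ordinary binary field.
      static Document buildDoc(String text) {
        Document doc = new Document();
        doc.add(new Field("body", CompressionTools.compressString(text), Field.Store.YES));
        return doc;
      }

      // Search side: decompression is no longer automatic, the application does it.
      static String readBody(Document doc) throws DataFormatException {
        byte[] compressed = doc.getBinaryValue("body");
        return compressed == null ? null : CompressionTools.decompressString(compressed);
      }
    }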
diff --git a/src/java/org/apache/lucene/index/FieldsReader.java b/src/java/org/apache/lucene/index/FieldsReader.java
index 346ef7eacf0..a99e8c52fe6 100644
--- a/src/java/org/apache/lucene/index/FieldsReader.java
+++ b/src/java/org/apache/lucene/index/FieldsReader.java
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.AbstractField;
+import org.apache.lucene.document.CompressionTools;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldSelector;
@@ -32,6 +33,7 @@ import org.apache.lucene.util.CloseableThreadLocal;
 
 import java.io.IOException;
 import java.io.Reader;
+import java.util.zip.DataFormatException;
 
 /**
  * Class responsible for access to stored document fields.
@@ -203,7 +205,11 @@ final class FieldsReader implements Cloneable {
   }
 
   boolean canReadRawDocs() {
-    return format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
+    // Disable reading raw docs in 2.x format, because of the removal of compressed
+    // fields in 3.0. We don't want rawDocs() to decode field bits to figure out
+    // if a field was compressed, hence we enforce ordinary (non-raw) stored field merges
+    // for <3.0 indexes.
+    return format >= FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
   }
 
   final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
@@ -219,31 +225,34 @@ final class FieldsReader implements Cloneable {
       FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);
 
       byte bits = fieldsStream.readByte();
-      assert bits <= FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;
+      assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;
+      boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
+      assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true)
+        : "compressed fields are only allowed in indexes of version <= 2.9";
 
       boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
       boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
       //TODO: Find an alternative approach here if this list continues to grow beyond the
       //list of 5 or 6 currently here. See Lucene 762 for discussion
       if (acceptField.equals(FieldSelectorResult.LOAD)) {
-        addField(doc, fi, binary, tokenize);
+        addField(doc, fi, binary, compressed, tokenize);
       }
       else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){
-        addField(doc, fi, binary, tokenize);
+        addField(doc, fi, binary, compressed, tokenize);
         break;//Get out of this loop
       }
       else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) {
-        addFieldLazy(doc, fi, binary, tokenize);
+        addFieldLazy(doc, fi, binary, compressed, tokenize);
       }
       else if (acceptField.equals(FieldSelectorResult.SIZE)){
-        skipField(binary, addFieldSize(doc, fi, binary));
+        skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed));
       }
       else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){
-        addFieldSize(doc, fi, binary);
+        addFieldSize(doc, fi, binary, compressed);
         break;
       }
       else {
-        skipField(binary);
+        skipField(binary, compressed);
       }
     }
@@ -280,12 +289,12 @@ final class FieldsReader implements Cloneable {
    * Skip the field. We still have to read some of the information about the field, but can skip past the actual content.
    * This will have the most payoff on large fields.
    */
-  private void skipField(boolean binary) throws IOException {
-    skipField(binary, fieldsStream.readVInt());
+  private void skipField(boolean binary, boolean compressed) throws IOException {
+    skipField(binary, compressed, fieldsStream.readVInt());
   }
 
-  private void skipField(boolean binary, int toRead) throws IOException {
-    if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary) {
+  private void skipField(boolean binary, boolean compressed, int toRead) throws IOException {
+    if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) {
       fieldsStream.seek(fieldsStream.getFilePointer() + toRead);
     } else {
       // We need to skip chars. This will slow us down, but still better
@@ -293,12 +302,12 @@ final class FieldsReader implements Cloneable {
     }
   }
 
-  private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean tokenize) throws IOException {
+  private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
     if (binary) {
       int toRead = fieldsStream.readVInt();
       long pointer = fieldsStream.getFilePointer();
       //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
-      doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary));
+      doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed));
       //Need to move the pointer ahead by toRead positions
       fieldsStream.seek(pointer + toRead);
     } else {
@@ -307,43 +316,75 @@ final class FieldsReader implements Cloneable {
       Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
       AbstractField f;
-      int length = fieldsStream.readVInt();
-      long pointer = fieldsStream.getFilePointer();
-      //Skip ahead of where we are by the length of what is stored
-      if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
-        fieldsStream.seek(pointer+length);
-      else
-        fieldsStream.skipChars(length);
-      f = new LazyField(fi.name, store, index, termVector, length, pointer, binary);
-      f.setOmitNorms(fi.omitNorms);
-      f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
+      if (compressed) {
+        int toRead = fieldsStream.readVInt();
+        long pointer = fieldsStream.getFilePointer();
+        f = new LazyField(fi.name, store, toRead, pointer, binary, compressed);
+        //skip over the part that we aren't loading
+        fieldsStream.seek(pointer + toRead);
+        f.setOmitNorms(fi.omitNorms);
+        f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
+      } else {
+        int length = fieldsStream.readVInt();
+        long pointer = fieldsStream.getFilePointer();
+        //Skip ahead of where we are by the length of what is stored
+        if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
+          fieldsStream.seek(pointer+length);
+        } else {
+          fieldsStream.skipChars(length);
+        }
+        f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, compressed);
+        f.setOmitNorms(fi.omitNorms);
+        f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
+      }
+
       doc.add(f);
     }
   }
 
-  private void addField(Document doc, FieldInfo fi, boolean binary, boolean tokenize) throws CorruptIndexException, IOException {
+  private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws CorruptIndexException, IOException {
 
     //we have a binary stored field, and it may be compressed
     if (binary) {
       int toRead = fieldsStream.readVInt();
       final byte[] b = new byte[toRead];
       fieldsStream.readBytes(b, 0, b.length);
-      doc.add(new Field(fi.name, b, Field.Store.YES));
+      if (compressed) {
+        doc.add(new Field(fi.name, uncompress(b), Field.Store.YES));
+      } else {
+        doc.add(new Field(fi.name, b, Field.Store.YES));
+      }
     } else {
       Field.Store store = Field.Store.YES;
       Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
       Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
       AbstractField f;
-      f = new Field(fi.name,     // name
-            false,
-            fieldsStream.readString(), // read value
-            store,
-            index,
-            termVector);
-      f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
-      f.setOmitNorms(fi.omitNorms);
+      if (compressed) {
+        int toRead = fieldsStream.readVInt();
+
+        final byte[] b = new byte[toRead];
+        fieldsStream.readBytes(b, 0, b.length);
+        f = new Field(fi.name,      // field name
+                false,
+                new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
+                store,
+                index,
+                termVector);
+        f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
+        f.setOmitNorms(fi.omitNorms);
+      } else {
+        f = new Field(fi.name,     // name
+              false,
+              fieldsStream.readString(), // read value
+              store,
+              index,
+              termVector);
+        f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
+        f.setOmitNorms(fi.omitNorms);
+      }
+
       doc.add(f);
     }
   }
@@ -351,8 +392,8 @@ final class FieldsReader implements Cloneable {
   // Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes)
   // Read just the size -- caller must skip the field content to continue reading fields
   // Return the size in bytes or chars, depending on field type
-  private int addFieldSize(Document doc, FieldInfo fi, boolean binary) throws IOException {
-    int size = fieldsStream.readVInt(), bytesize = binary ? size : 2*size;
+  private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed) throws IOException {
+    int size = fieldsStream.readVInt(), bytesize = binary || compressed ? size : 2*size;
     byte[] sizebytes = new byte[4];
     sizebytes[0] = (byte) (bytesize>>>24);
     sizebytes[1] = (byte) (bytesize>>>16);
@@ -369,8 +410,10 @@ final class FieldsReader implements Cloneable {
   private class LazyField extends AbstractField implements Fieldable {
     private int toRead;
     private long pointer;
+    /** @deprecated Only kept for backward-compatbility with <3.0 indexes. Will be removed in 4.0. */
+    private boolean isCompressed;
 
-    public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary) {
+    public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary, boolean isCompressed) {
       super(name, store, Field.Index.NO, Field.TermVector.NO);
       this.toRead = toRead;
       this.pointer = pointer;
@@ -378,9 +421,10 @@ final class FieldsReader implements Cloneable {
       if (isBinary)
         binaryLength = toRead;
       lazy = true;
+      this.isCompressed = isCompressed;
     }
 
-    public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary) {
+    public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary, boolean isCompressed) {
       super(name, store, index, termVector);
       this.toRead = toRead;
       this.pointer = pointer;
@@ -388,6 +432,7 @@ final class FieldsReader implements Cloneable {
       if (isBinary)
         binaryLength = toRead;
       lazy = true;
+      this.isCompressed = isCompressed;
     }
 
     private IndexInput getFieldStream() {
@@ -427,15 +472,21 @@ final class FieldsReader implements Cloneable {
         IndexInput localFieldsStream = getFieldStream();
         try {
           localFieldsStream.seek(pointer);
-          if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
-            byte[] bytes = new byte[toRead];
-            localFieldsStream.readBytes(bytes, 0, toRead);
-            fieldsData = new String(bytes, "UTF-8");
+          if (isCompressed) {
+            final byte[] b = new byte[toRead];
+            localFieldsStream.readBytes(b, 0, b.length);
+            fieldsData = new String(uncompress(b), "UTF-8");
           } else {
-            //read in chars b/c we already know the length we need to read
-            char[] chars = new char[toRead];
-            localFieldsStream.readChars(chars, 0, toRead);
-            fieldsData = new String(chars);
+            if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
+              byte[] bytes = new byte[toRead];
+              localFieldsStream.readBytes(bytes, 0, toRead);
+              fieldsData = new String(bytes, "UTF-8");
+            } else {
+              //read in chars b/c we already know the length we need to read
+              char[] chars = new char[toRead];
+              localFieldsStream.readChars(chars, 0, toRead);
+              fieldsData = new String(chars);
+            }
           }
         } catch (IOException e) {
           throw new FieldReaderException(e);
@@ -484,7 +535,11 @@ final class FieldsReader implements Cloneable {
         try {
           localFieldsStream.seek(pointer);
           localFieldsStream.readBytes(b, 0, toRead);
-          fieldsData = b;
+          if (isCompressed == true) {
+            fieldsData = uncompress(b);
+          } else {
+            fieldsData = b;
+          }
         } catch (IOException e) {
           throw new FieldReaderException(e);
         }
@@ -498,4 +553,16 @@
       return null;
     }
   }
+
+  private byte[] uncompress(byte[] b)
+        throws CorruptIndexException {
+    try {
+      return CompressionTools.decompress(b);
+    } catch (DataFormatException e) {
+      // this will happen if the field is not compressed
+      CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString());
+      newException.initCause(e);
+      throw newException;
+    }
+  }
 }
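For context on the uncompress() helper added above: CompressionTools.decompress is essentially a wrapper around java.util.zip.Inflater, which throws DataFormatException when the stored bytes are not valid deflate data; that is the failure FieldsReader maps to CorruptIndexException. A rough, illustrative equivalent (not the actual CompressionTools source):

    import java.io.ByteArrayOutputStream;
    import java.util.zip.DataFormatException;
    import java.util.zip.Inflater;

    final class DecompressSketch {
      static byte[] decompress(byte[] value) throws DataFormatException {
        Inflater decompressor = new Inflater();
        try {
          decompressor.setInput(value);
          ByteArrayOutputStream bos = new ByteArrayOutputStream(value.length);
          byte[] buf = new byte[1024];
          while (!decompressor.finished()) {
            // inflate() throws DataFormatException on corrupt (non-deflate) input
            int count = decompressor.inflate(buf);
            bos.write(buf, 0, count);
          }
          return bos.toByteArray();
        } finally {
          decompressor.end();
        }
      }
    }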
diff --git a/src/java/org/apache/lucene/index/FieldsWriter.java b/src/java/org/apache/lucene/index/FieldsWriter.java
index c0a3db1e1e7..bc88612deda 100644
--- a/src/java/org/apache/lucene/index/FieldsWriter.java
+++ b/src/java/org/apache/lucene/index/FieldsWriter.java
@@ -30,17 +30,23 @@ final class FieldsWriter {
   static final byte FIELD_IS_TOKENIZED = 0x1;
   static final byte FIELD_IS_BINARY = 0x2;
+
+  /** @deprecated Kept for backwards-compatibility with <3.0 indexes; will be removed in 4.0 */
+  static final byte FIELD_IS_COMPRESSED = 0x4;
 
   // Original format
   static final int FORMAT = 0;
 
   // Changed strings to UTF8
   static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = 1;
+
+  // Lucene 3.0: Removal of compressed fields
+  static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2;
 
   // NOTE: if you introduce a new format, make it 1 higher
   // than the current one, and always change this if you
   // switch to a new format!
-  static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
+  static final int FORMAT_CURRENT = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
 
   private FieldInfos fieldInfos;
diff --git a/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
index 4e6b48e96a2..07a7668f4ea 100644
--- a/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
+++ b/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
@@ -22,21 +22,28 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
 import java.io.OutputStream;
 import java.util.Arrays;
 import java.util.Enumeration;
 import java.util.List;
+import java.util.ArrayList;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;
 
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.ReaderUtil;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
@@ -127,22 +134,105 @@ public class TestBackwardsCompatibility extends LuceneTestCase
                              "23.nocfs",
                              "24.cfs",
                              "24.nocfs",
+                             "29.cfs",
+                             "29.nocfs",
   };
+
+  private void assertCompressedFields29(Directory dir, boolean shouldStillBeCompressed) throws IOException {
+    int count = 0;
+    final int TEXT_PLAIN_LENGTH = TEXT_TO_COMPRESS.length() * 2;
+    // FieldSelectorResult.SIZE returns 2*number_of_chars for String fields:
+    final int BINARY_PLAIN_LENGTH = BINARY_TO_COMPRESS.length;
+
+    IndexReader reader = IndexReader.open(dir, true);
+    try {
+      // look into sub readers and check if raw merge is on/off
+      List readers = new ArrayList();
+      ReaderUtil.gatherSubReaders(readers, reader);
+      for (IndexReader ir : readers) {
+        final FieldsReader fr = ((SegmentReader) ir).getFieldsReader();
+        assertTrue("for a 2.9 index, FieldsReader.canReadRawDocs() must be false and other way round for a trunk index",
+          shouldStillBeCompressed != fr.canReadRawDocs());
+      }
+
+      // test that decompression works correctly
+      for(int i=0; i 0;
+          final int shouldSize = shouldStillBeCompressed ?
+            compressedSize :
+            (binary ? BINARY_PLAIN_LENGTH : TEXT_PLAIN_LENGTH);
+          assertEquals("size incorrect", shouldSize, actualSize);
+          if (!shouldStillBeCompressed) {
+            assertFalse("uncompressed field should have another size than recorded in index", compressedSize == actualSize);
+          }
+        }
+      }
+      assertEquals("correct number of tests", 34 * 2, count);
+    } finally {
+      reader.close();
+    }
+  }
 
   public void testOptimizeOldIndex() throws IOException {
+    int hasTested29 = 0;
+    for(int i=0;i
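The conversion recommended in the CHANGES entry amounts to forcing a merge of the old index with a 3.0 IndexWriter: optimize() decompresses all compressed fields and rewrites the stored fields in the FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS format, after which older Lucene versions can no longer read the index. A minimal sketch, assuming the 3.0 API; the index path is a placeholder, and the analyzer choice does not matter here because no documents are added.

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class UpgradeOldIndex {
      public static void main(String[] args) throws IOException {
        // Placeholder path to an existing pre-3.0 index.
        Directory dir = FSDirectory.open(new File("/path/to/old-index"));
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(),
            false /* open existing index */, IndexWriter.MaxFieldLength.UNLIMITED);
        try {
          writer.optimize();  // merges all segments; compressed fields are decompressed on the way
        } finally {
          writer.close();
          dir.close();
        }
      }
    }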