mirror of https://github.com/apache/lucene.git

LUCENE-1960: Add support for reading Field.Store.COMPRESS fields again and decompress them on merge/optimize. Also convert to a new stored-fields file version number.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@829972 13f79535-47bb-0310-9956-ffa450edef68

parent 6a1f2d5729
commit c61fa48daf

CHANGES.txt | 19 +++++++++++++++++++
CHANGES.txt:

@@ -15,6 +15,25 @@ Changes in runtime behavior
 * LUCENE-1677: Remove the system property to set SegmentReader class
   implementation. (Uwe Schindler)
 
+* LUCENE-1960: As a consequence of the removal of Field.Store.COMPRESS,
+  support for this type of field was removed. Lucene 3.0 is still able
+  to read indexes with compressed fields, but as soon as merges occur
+  or the index is optimized, all compressed fields are decompressed
+  and converted to Field.Store.YES. Because of this, indexes with
+  compressed fields can suddenly get larger. Also, the first merge with
+  decompression cannot be done in raw mode and is therefore slower.
+  This change has no effect on code that uses such old indexes; they
+  behave as before (fields are automatically decompressed during read).
+  Indexes converted to the Lucene 3.0 format can no longer be read by
+  previous versions.
+  It is recommended to optimize your indexes after upgrading, to convert
+  to the new format and decompress all fields.
+  If you want compressed fields, you can use CompressionTools, which
+  creates a compressed byte[] to be added as a binary stored field. This
+  cannot be done automatically, as you also have to decompress such
+  fields when reading; you have to reindex to do that.
+  (Michael Busch, Uwe Schindler)
+
 API Changes
 
 * LUCENE-1257, LUCENE-1984, LUCENE-1985, ...: Port to Java 1.5 [not yet finished].
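For code that still needs compressed storage, the migration the entry above recommends looks roughly as follows; a minimal sketch, assuming a field named "body" (the class and method names are illustrative, not part of this commit):

    import org.apache.lucene.document.CompressionTools;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import java.util.zip.DataFormatException;

    class ManualCompressionSketch {
      // Index time: compress the value yourself and store it as a plain binary field.
      static Document makeDocument(String text) {
        Document doc = new Document();
        doc.add(new Field("body", CompressionTools.compressString(text), Field.Store.YES));
        return doc;
      }

      // Read time: decompression is now the application's job, hence the reindex advice.
      static String readBody(Document stored) throws DataFormatException {
        return CompressionTools.decompressString(stored.getBinaryValue("body"));
      }
    }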
FieldsReader.java:

@@ -19,6 +19,7 @@ package org.apache.lucene.index;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.AbstractField;
+import org.apache.lucene.document.CompressionTools;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldSelector;
@@ -32,6 +33,7 @@ import org.apache.lucene.util.CloseableThreadLocal;
 
 import java.io.IOException;
 import java.io.Reader;
+import java.util.zip.DataFormatException;
 
 /**
  * Class responsible for access to stored document fields.
@@ -203,7 +205,11 @@ final class FieldsReader implements Cloneable {
   }
 
   boolean canReadRawDocs() {
-    return format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
+    // Disable reading raw docs in 2.x format, because of the removal of compressed
+    // fields in 3.0. We don't want rawDocs() to decode field bits to figure out
+    // if a field was compressed, hence we enforce ordinary (non-raw) stored field
+    // merges for <3.0 indexes.
+    return format >= FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
   }
 
   final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
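The comment above is the heart of the change: whether a merge may bulk-copy raw stored-field bytes now hinges on the segment's format version. A minimal sketch of that decision (helper name hypothetical; the actual consumer is the stored-fields merge):

    // Raw (bulk byte-copy) merging is only safe when the source segment's
    // format guarantees it contains no compressed fields.
    static boolean canBulkCopyStoredFields(FieldsReader matchingFieldsReader) {
      return matchingFieldsReader != null && matchingFieldsReader.canReadRawDocs();
    }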
@@ -219,31 +225,34 @@ final class FieldsReader implements Cloneable {
       FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);
 
       byte bits = fieldsStream.readByte();
-      assert bits <= FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;
+      assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;
 
+      boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
+      assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true)
+        : "compressed fields are only allowed in indexes of version <= 2.9";
       boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
       boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
       //TODO: Find an alternative approach here if this list continues to grow beyond the
       //list of 5 or 6 currently here.  See Lucene 762 for discussion
       if (acceptField.equals(FieldSelectorResult.LOAD)) {
-        addField(doc, fi, binary, tokenize);
+        addField(doc, fi, binary, compressed, tokenize);
       }
       else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){
-        addField(doc, fi, binary, tokenize);
+        addField(doc, fi, binary, compressed, tokenize);
         break;//Get out of this loop
       }
       else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) {
-        addFieldLazy(doc, fi, binary, tokenize);
+        addFieldLazy(doc, fi, binary, compressed, tokenize);
       }
       else if (acceptField.equals(FieldSelectorResult.SIZE)){
-        skipField(binary, addFieldSize(doc, fi, binary));
+        skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed));
       }
       else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){
-        addFieldSize(doc, fi, binary);
+        addFieldSize(doc, fi, binary, compressed);
         break;
       }
       else {
-        skipField(binary);
+        skipField(binary, compressed);
       }
     }
 
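The SIZE branches above are driven by a FieldSelector supplied by the caller; a short sketch of requesting only the size of one field (field name illustrative; an open IndexReader `reader` and a document id `docId` are assumed):

    import org.apache.lucene.document.FieldSelector;
    import org.apache.lucene.document.FieldSelectorResult;

    FieldSelector sizeOnly = new FieldSelector() {
      public FieldSelectorResult accept(String fieldName) {
        // SIZE loads a 4-byte size marker instead of the field content.
        return "compressed".equals(fieldName) ? FieldSelectorResult.SIZE : FieldSelectorResult.LOAD;
      }
    };
    Document d = reader.document(docId, sizeOnly);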
@@ -280,12 +289,12 @@ final class FieldsReader implements Cloneable {
    * Skip the field. We still have to read some of the information about the field, but can skip past the actual content.
    * This will have the most payoff on large fields.
    */
-  private void skipField(boolean binary) throws IOException {
-    skipField(binary, fieldsStream.readVInt());
+  private void skipField(boolean binary, boolean compressed) throws IOException {
+    skipField(binary, compressed, fieldsStream.readVInt());
   }
 
-  private void skipField(boolean binary, int toRead) throws IOException {
-    if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary) {
+  private void skipField(boolean binary, boolean compressed, int toRead) throws IOException {
+    if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) {
       fieldsStream.seek(fieldsStream.getFilePointer() + toRead);
     } else {
       // We need to skip chars. This will slow us down, but still better
@@ -293,12 +302,12 @@ final class FieldsReader implements Cloneable {
     }
   }
 
-  private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean tokenize) throws IOException {
+  private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
     if (binary) {
       int toRead = fieldsStream.readVInt();
       long pointer = fieldsStream.getFilePointer();
       //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
-      doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary));
+      doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed));
       //Need to move the pointer ahead by toRead positions
       fieldsStream.seek(pointer + toRead);
     } else {
@@ -307,43 +316,75 @@ final class FieldsReader implements Cloneable {
       Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
 
       AbstractField f;
-      int length = fieldsStream.readVInt();
-      long pointer = fieldsStream.getFilePointer();
-      //Skip ahead of where we are by the length of what is stored
-      if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
-        fieldsStream.seek(pointer+length);
-      else
-        fieldsStream.skipChars(length);
-      f = new LazyField(fi.name, store, index, termVector, length, pointer, binary);
-      f.setOmitNorms(fi.omitNorms);
-      f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
+      if (compressed) {
+        int toRead = fieldsStream.readVInt();
+        long pointer = fieldsStream.getFilePointer();
+        f = new LazyField(fi.name, store, toRead, pointer, binary, compressed);
+        //skip over the part that we aren't loading
+        fieldsStream.seek(pointer + toRead);
+        f.setOmitNorms(fi.omitNorms);
+        f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
+      } else {
+        int length = fieldsStream.readVInt();
+        long pointer = fieldsStream.getFilePointer();
+        //Skip ahead of where we are by the length of what is stored
+        if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
+          fieldsStream.seek(pointer+length);
+        } else {
+          fieldsStream.skipChars(length);
+        }
+        f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, compressed);
+        f.setOmitNorms(fi.omitNorms);
+        f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
+      }
 
       doc.add(f);
     }
   }
 
-  private void addField(Document doc, FieldInfo fi, boolean binary, boolean tokenize) throws CorruptIndexException, IOException {
+  private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws CorruptIndexException, IOException {
 
     //we have a binary stored field, and it may be compressed
     if (binary) {
       int toRead = fieldsStream.readVInt();
       final byte[] b = new byte[toRead];
       fieldsStream.readBytes(b, 0, b.length);
-      doc.add(new Field(fi.name, b, Field.Store.YES));
+      if (compressed) {
+        doc.add(new Field(fi.name, uncompress(b), Field.Store.YES));
+      } else {
+        doc.add(new Field(fi.name, b, Field.Store.YES));
+      }
     } else {
       Field.Store store = Field.Store.YES;
       Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
       Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
 
       AbstractField f;
-      f = new Field(fi.name,     // name
-              false,
-              fieldsStream.readString(), // read value
-              store,
-              index,
-              termVector);
-      f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
-      f.setOmitNorms(fi.omitNorms);
+      if (compressed) {
+        int toRead = fieldsStream.readVInt();
+
+        final byte[] b = new byte[toRead];
+        fieldsStream.readBytes(b, 0, b.length);
+        f = new Field(fi.name,      // field name
+                false,
+                new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
+                store,
+                index,
+                termVector);
+        f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
+        f.setOmitNorms(fi.omitNorms);
+      } else {
+        f = new Field(fi.name,     // name
+                false,
+                fieldsStream.readString(), // read value
+                store,
+                index,
+                termVector);
+        f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
+        f.setOmitNorms(fi.omitNorms);
+      }
 
       doc.add(f);
     }
   }
@@ -351,8 +392,8 @@ final class FieldsReader implements Cloneable {
   // Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes)
   // Read just the size -- caller must skip the field content to continue reading fields
   // Return the size in bytes or chars, depending on field type
-  private int addFieldSize(Document doc, FieldInfo fi, boolean binary) throws IOException {
-    int size = fieldsStream.readVInt(), bytesize = binary ? size : 2*size;
+  private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed) throws IOException {
+    int size = fieldsStream.readVInt(), bytesize = binary || compressed ? size : 2*size;
     byte[] sizebytes = new byte[4];
     sizebytes[0] = (byte) (bytesize>>>24);
     sizebytes[1] = (byte) (bytesize>>>16);
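The sizebytes layout above is a big-endian 32-bit int, so a consumer can decode it with DataInputStream rather than hand-rolled shifts; a sketch (mirroring what the backwards-compatibility test further down does):

    import java.io.ByteArrayInputStream;
    import java.io.DataInputStream;
    import java.io.IOException;

    static int decodeFieldSize(byte[] sizebytes) throws IOException {
      // readInt() consumes the high-order byte first, matching sizebytes[0..3].
      DataInputStream in = new DataInputStream(new ByteArrayInputStream(sizebytes));
      try {
        return in.readInt();
      } finally {
        in.close();
      }
    }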
@@ -369,8 +410,10 @@ final class FieldsReader implements Cloneable {
   private class LazyField extends AbstractField implements Fieldable {
     private int toRead;
     private long pointer;
+    /** @deprecated Only kept for backward-compatibility with <3.0 indexes. Will be removed in 4.0. */
+    private boolean isCompressed;
 
-    public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary) {
+    public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary, boolean isCompressed) {
       super(name, store, Field.Index.NO, Field.TermVector.NO);
       this.toRead = toRead;
       this.pointer = pointer;
@@ -378,9 +421,10 @@ final class FieldsReader implements Cloneable {
       if (isBinary)
         binaryLength = toRead;
       lazy = true;
+      this.isCompressed = isCompressed;
     }
 
-    public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary) {
+    public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary, boolean isCompressed) {
       super(name, store, index, termVector);
       this.toRead = toRead;
       this.pointer = pointer;
@@ -388,6 +432,7 @@ final class FieldsReader implements Cloneable {
       if (isBinary)
         binaryLength = toRead;
       lazy = true;
+      this.isCompressed = isCompressed;
     }
 
     private IndexInput getFieldStream() {
@@ -427,15 +472,21 @@ final class FieldsReader implements Cloneable {
         IndexInput localFieldsStream = getFieldStream();
         try {
           localFieldsStream.seek(pointer);
-          if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
-            byte[] bytes = new byte[toRead];
-            localFieldsStream.readBytes(bytes, 0, toRead);
-            fieldsData = new String(bytes, "UTF-8");
-          } else {
-            //read in chars b/c we already know the length we need to read
-            char[] chars = new char[toRead];
-            localFieldsStream.readChars(chars, 0, toRead);
-            fieldsData = new String(chars);
-          }
+          if (isCompressed) {
+            final byte[] b = new byte[toRead];
+            localFieldsStream.readBytes(b, 0, b.length);
+            fieldsData = new String(uncompress(b), "UTF-8");
+          } else {
+            if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
+              byte[] bytes = new byte[toRead];
+              localFieldsStream.readBytes(bytes, 0, toRead);
+              fieldsData = new String(bytes, "UTF-8");
+            } else {
+              //read in chars b/c we already know the length we need to read
+              char[] chars = new char[toRead];
+              localFieldsStream.readChars(chars, 0, toRead);
+              fieldsData = new String(chars);
+            }
+          }
         } catch (IOException e) {
           throw new FieldReaderException(e);
@@ -484,7 +535,11 @@ final class FieldsReader implements Cloneable {
         try {
           localFieldsStream.seek(pointer);
           localFieldsStream.readBytes(b, 0, toRead);
-          fieldsData = b;
+          if (isCompressed) {
+            fieldsData = uncompress(b);
+          } else {
+            fieldsData = b;
+          }
         } catch (IOException e) {
           throw new FieldReaderException(e);
         }
@@ -498,4 +553,16 @@ final class FieldsReader implements Cloneable {
       return null;
     }
   }
+
+  private byte[] uncompress(byte[] b)
+        throws CorruptIndexException {
+    try {
+      return CompressionTools.decompress(b);
+    } catch (DataFormatException e) {
+      // this will happen if the field is not compressed
+      CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString());
+      newException.initCause(e);
+      throw newException;
+    }
+  }
 }
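CompressionTools.decompress, which uncompress() wraps, boils down to a java.util.zip.Inflater loop; a rough equivalent for reference (buffer size and structure are assumptions, not the library's exact code):

    import java.io.ByteArrayOutputStream;
    import java.util.zip.DataFormatException;
    import java.util.zip.Inflater;

    static byte[] inflate(byte[] value) throws DataFormatException {
      Inflater decompressor = new Inflater();
      try {
        decompressor.setInput(value);
        ByteArrayOutputStream bos = new ByteArrayOutputStream(value.length);
        byte[] buf = new byte[1024];
        while (!decompressor.finished()) {
          // inflate() throws DataFormatException on non-compressed input,
          // which uncompress() above rewraps as CorruptIndexException.
          int count = decompressor.inflate(buf);
          bos.write(buf, 0, count);
        }
        return bos.toByteArray();
      } finally {
        decompressor.end();
      }
    }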
FieldsWriter.java:

@@ -30,17 +30,23 @@ final class FieldsWriter
 {
   static final byte FIELD_IS_TOKENIZED = 0x1;
   static final byte FIELD_IS_BINARY = 0x2;
+
+  /** @deprecated Kept for backwards-compatibility with <3.0 indexes; will be removed in 4.0 */
+  static final byte FIELD_IS_COMPRESSED = 0x4;
 
   // Original format
   static final int FORMAT = 0;
 
   // Changed strings to UTF8
   static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = 1;
+
+  // Lucene 3.0: Removal of compressed fields
+  static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2;
 
   // NOTE: if you introduce a new format, make it 1 higher
   // than the current one, and always change this if you
   // switch to a new format!
-  static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
+  static final int FORMAT_CURRENT = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
 
   private FieldInfos fieldInfos;
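The reader-side checks in FieldsReader key off these constants; as a sketch, the two capability predicates this commit relies on (method names illustrative):

    // Strings are length-prefixed in UTF-8 bytes from format 1 onwards:
    static boolean supportsUtf8ByteLengths(int format) {
      return format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
    }

    // Only segments older than format 2 may still contain compressed fields:
    static boolean mayContainCompressedFields(int format) {
      return format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
    }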
TestBackwardsCompatibility.java:

@@ -22,21 +22,28 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
 import java.io.OutputStream;
+import java.util.Arrays;
 import java.util.Enumeration;
+import java.util.List;
+import java.util.ArrayList;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;
 
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.document.FieldSelectorResult;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.ReaderUtil;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
@@ -127,22 +134,105 @@ public class TestBackwardsCompatibility extends LuceneTestCase
                              "23.nocfs",
                              "24.cfs",
                              "24.nocfs",
+                             "29.cfs",
+                             "29.nocfs",
   };
 
+  private void assertCompressedFields29(Directory dir, boolean shouldStillBeCompressed) throws IOException {
+    int count = 0;
+    // FieldSelectorResult.SIZE returns 2*number_of_chars for String fields:
+    final int TEXT_PLAIN_LENGTH = TEXT_TO_COMPRESS.length() * 2;
+    final int BINARY_PLAIN_LENGTH = BINARY_TO_COMPRESS.length;
+
+    IndexReader reader = IndexReader.open(dir, true);
+    try {
+      // look into sub readers and check if raw merge is on/off
+      List<IndexReader> readers = new ArrayList<IndexReader>();
+      ReaderUtil.gatherSubReaders(readers, reader);
+      for (IndexReader ir : readers) {
+        final FieldsReader fr = ((SegmentReader) ir).getFieldsReader();
+        assertTrue("for a 2.9 index, FieldsReader.canReadRawDocs() must be false and other way round for a trunk index",
+          shouldStillBeCompressed != fr.canReadRawDocs());
+      }
+
+      // test that decompression works correctly
+      for(int i=0; i<reader.maxDoc(); i++) {
+        if (!reader.isDeleted(i)) {
+          Document d = reader.document(i);
+          if (d.get("content3") != null) continue;
+          count++;
+          Fieldable compressed = d.getFieldable("compressed");
+          if (Integer.parseInt(d.get("id")) % 2 == 0) {
+            assertFalse(compressed.isBinary());
+            assertEquals("incorrectly decompressed string", TEXT_TO_COMPRESS, compressed.stringValue());
+          } else {
+            assertTrue(compressed.isBinary());
+            assertTrue("incorrectly decompressed binary", Arrays.equals(BINARY_TO_COMPRESS, compressed.getBinaryValue()));
+          }
+        }
+      }
+
+      // check if field was decompressed after optimize
+      for(int i=0; i<reader.maxDoc(); i++) {
+        if (!reader.isDeleted(i)) {
+          Document d = reader.document(i, new FieldSelector() {
+            public FieldSelectorResult accept(String fieldName) {
+              return ("compressed".equals(fieldName)) ? FieldSelectorResult.SIZE : FieldSelectorResult.LOAD;
+            }
+          });
+          if (d.get("content3") != null) continue;
+          count++;
+          // read the size from the binary value using DataInputStream (this prevents us from doing the shift ops ourselves):
+          final DataInputStream ds = new DataInputStream(new ByteArrayInputStream(d.getFieldable("compressed").getBinaryValue()));
+          final int actualSize = ds.readInt();
+          ds.close();
+          final int compressedSize = Integer.parseInt(d.get("compressedSize"));
+          final boolean binary = Integer.parseInt(d.get("id")) % 2 > 0;
+          final int shouldSize = shouldStillBeCompressed ?
+            compressedSize :
+            (binary ? BINARY_PLAIN_LENGTH : TEXT_PLAIN_LENGTH);
+          assertEquals("size incorrect", shouldSize, actualSize);
+          if (!shouldStillBeCompressed) {
+            assertFalse("uncompressed field should have another size than recorded in index", compressedSize == actualSize);
+          }
+        }
+      }
+      assertEquals("correct number of tests", 34 * 2, count);
+    } finally {
+      reader.close();
+    }
+  }
+
   public void testOptimizeOldIndex() throws IOException {
+    int hasTested29 = 0;
+
     for(int i=0;i<oldNames.length;i++) {
       String dirName = "src/test/org/apache/lucene/index/index." + oldNames[i];
       unzip(dirName, oldNames[i]);
       String fullPath = fullDir(oldNames[i]);
       Directory dir = FSDirectory.open(new File(fullPath));
+
+      if (oldNames[i].startsWith("29.")) {
+        assertCompressedFields29(dir, true);
+        hasTested29++;
+      }
+
       IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
       w.optimize();
       w.close();
 
       _TestUtil.checkIndex(dir);
+
+      if (oldNames[i].startsWith("29.")) {
+        assertCompressedFields29(dir, false);
+        hasTested29++;
+      }
+
       dir.close();
       rmDir(oldNames[i]);
     }
+
+    assertEquals("test for compressed field should have run 4 times", 4, hasTested29);
   }
 
   public void testSearchOldIndex() throws IOException {
@@ -203,7 +293,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase
         !oldName.startsWith("22.")) {
 
       if (d.getField("content3") == null) {
-        assertEquals(5, fields.size());
+        final int numFields = oldName.startsWith("29.") ? 7 : 5;
+        assertEquals(numFields, fields.size());
         Field f = (Field) d.getField("id");
         assertEquals(""+i, f.stringValue());
@@ -497,6 +588,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase
       doc.add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
       doc.add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
       doc.add(new Field("fie\u2C77ld", "field with non-ascii name", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+      /* This was used in 2.9 to generate an index with compressed field:
+      if (id % 2 == 0) {
+        doc.add(new Field("compressed", TEXT_TO_COMPRESS, Field.Store.COMPRESS, Field.Index.NOT_ANALYZED));
+        doc.add(new Field("compressedSize", Integer.toString(TEXT_COMPRESSED_LENGTH), Field.Store.YES, Field.Index.NOT_ANALYZED));
+      } else {
+        doc.add(new Field("compressed", BINARY_TO_COMPRESS, Field.Store.COMPRESS));
+        doc.add(new Field("compressedSize", Integer.toString(BINARY_COMPRESSED_LENGTH), Field.Store.YES, Field.Index.NOT_ANALYZED));
+      }
+      */
       writer.addDocument(doc);
     }
@@ -527,4 +627,22 @@ public class TestBackwardsCompatibility extends LuceneTestCase
   public static String fullDir(String dirName) throws IOException {
     return new File(System.getProperty("tempDir"), dirName).getCanonicalPath();
   }
+
+  static final String TEXT_TO_COMPRESS = "this is a compressed field and should appear in 3.0 as an uncompressed field after merge";
+  // FieldSelectorResult.SIZE returns compressed size for compressed fields, which are internally handled as binary;
+  // do it in the same way like FieldsWriter, do not use CompressionTools.compressString() for compressed fields:
+  /* This was used in 2.9 to generate an index with compressed field:
+  static final int TEXT_COMPRESSED_LENGTH;
+  static {
+    try {
+      TEXT_COMPRESSED_LENGTH = CompressionTools.compress(TEXT_TO_COMPRESS.getBytes("UTF-8")).length;
+    } catch (Exception e) {
+      throw new RuntimeException();
+    }
+  }
+  */
+  static final byte[] BINARY_TO_COMPRESS = new byte[]{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
+  /* This was used in 2.9 to generate an index with compressed field:
+  static final int BINARY_COMPRESSED_LENGTH = CompressionTools.compress(BINARY_TO_COMPRESS).length;
+  */
 }
Two binary files (the 2.9 test index zips referenced above) are not shown.