LUCENE-652: add org.apache.lucene.document.CompressionTools; deprecate Field.Store.COMPRESS

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@756635 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2009-03-20 17:13:42 +00:00
parent dc1447005d
commit 39c8421992
6 changed files with 160 additions and 81 deletions

CHANGES.txt

@@ -59,6 +59,10 @@ API Changes
  9. LUCENE-1186: Add Analyzer.close() to free internal ThreadLocal
     resources.  (Christian Kohlschütter via Mike McCandless)
 
+10. LUCENE-652: Added org.apache.lucene.document.CompressionTools, to
+    enable compressing & decompressing binary content, external to
+    Lucene's indexing.  Deprecated Field.Store.COMPRESS.
+
 Bug fixes
 
 1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
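
A minimal round trip with the new class, adapted from the usage notes in CompressionTools' own javadoc (the string literal is illustrative; decompress declares the checked java.util.zip.DataFormatException):

    byte[] compressed = CompressionTools.compress("some long stored value".getBytes("UTF-8"));
    String restored = new String(CompressionTools.decompress(compressed), "UTF-8");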

org/apache/lucene/document/CompressionTools.java (new file)

@@ -0,0 +1,110 @@
package org.apache.lucene.document;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.zip.Deflater;
import java.util.zip.Inflater;
import java.util.zip.DataFormatException;
import java.io.ByteArrayOutputStream;

/** Simple utility class providing static methods to
 *  compress and decompress binary data for stored fields.
 *  This class uses the java.util.zip.Deflater and Inflater
 *  classes to compress and decompress.
 *
 * To compress a String:
 * <pre>
 *    String string = ...
 *    byte[] bytes = compress(string.getBytes("UTF-8"));
 * </pre>
 * and to decompress:
 * <pre>
 *    new String(decompress(bytes), "UTF-8");
 * </pre>
 */

public class CompressionTools {

  // Export only static methods
  private CompressionTools() {}

  /** Compresses the specified byte range using the
   *  specified compressionLevel (constants are defined in
   *  java.util.zip.Deflater). */
  public static byte[] compress(byte[] value, int offset, int length, int compressionLevel) {

    /* Create an expandable byte array to hold the compressed data.
     * You cannot use an array that's the same size as the original because
     * there is no guarantee that the compressed data will be smaller than
     * the uncompressed data. */
    ByteArrayOutputStream bos = new ByteArrayOutputStream(length);

    Deflater compressor = new Deflater();

    try {
      compressor.setLevel(compressionLevel);
      compressor.setInput(value, offset, length);
      compressor.finish();

      // Compress the data
      final byte[] buf = new byte[1024];
      while (!compressor.finished()) {
        int count = compressor.deflate(buf);
        bos.write(buf, 0, count);
      }
    } finally {
      compressor.end();
    }

    return bos.toByteArray();
  }

  /** Compresses the specified byte range, with default BEST_COMPRESSION level */
  public static byte[] compress(byte[] value, int offset, int length) {
    return compress(value, offset, length, Deflater.BEST_COMPRESSION);
  }

  /** Compresses all bytes in the array, with default BEST_COMPRESSION level */
  public static byte[] compress(byte[] value) {
    return compress(value, 0, value.length, Deflater.BEST_COMPRESSION);
  }

  /** Decompress the byte array previously returned by
   *  compress */
  public static byte[] decompress(byte[] value) throws DataFormatException {
    // Create an expandable byte array to hold the decompressed data
    ByteArrayOutputStream bos = new ByteArrayOutputStream(value.length);

    Inflater decompressor = new Inflater();

    try {
      decompressor.setInput(value);

      // Decompress the data
      final byte[] buf = new byte[1024];
      while (!decompressor.finished()) {
        int count = decompressor.inflate(buf);
        bos.write(buf, 0, count);
      }
    } finally {
      decompressor.end();
    }

    return bos.toByteArray();
  }
}
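
The four-argument overload above exposes the java.util.zip.Deflater level directly, so callers can trade compression ratio for speed; a small sketch, assuming a byte[] data already in hand:

    // Favor speed over ratio, e.g. for very large stored values
    byte[] fast = CompressionTools.compress(data, 0, data.length, Deflater.BEST_SPEED);

    // The shorter overloads default to Deflater.BEST_COMPRESSION
    byte[] small = CompressionTools.compress(data);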

org/apache/lucene/document/Field.java

@@ -43,6 +43,7 @@ public final class Field extends AbstractField implements Fieldable, Serializable
   /** Store the original field value in the index in a compressed form. This is
    * useful for long documents and for binary valued fields.
+   * @deprecated Please use {@link CompressionTools} instead
    */
   public static final Store COMPRESS = new Store("COMPRESS");
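
Migrating off the deprecated constant follows the pattern the updated TestBinaryDocument below uses; the field and variable names here are illustrative:

    // Before (deprecated): Lucene compresses the value at index time
    doc.add(new Field("body", value.getBytes("UTF-8"), Field.Store.COMPRESS));

    // After: compress externally, store as an ordinary binary field
    doc.add(new Field("body", CompressionTools.compress(value.getBytes("UTF-8")), Field.Store.YES));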

org/apache/lucene/index/FieldsReader.java

@@ -25,11 +25,9 @@ import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.BufferedIndexInput;
 import org.apache.lucene.util.CloseableThreadLocal;
 
-import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.zip.DataFormatException;
-import java.util.zip.Inflater;
 
 /**
  * Class responsible for access to stored document fields.
@@ -596,40 +594,19 @@ final class FieldsReader implements Cloneable {
       return null;
     }
   }
 
-  private final byte[] uncompress(final byte[] input)
-    throws CorruptIndexException, IOException {
-
-    // Create an expandable byte array to hold the decompressed data
-    ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
-    Inflater decompressor = new Inflater();
-
+  private byte[] uncompress(byte[] b)
+        throws CorruptIndexException {
     try {
-      decompressor.setInput(input);
-
-      // Decompress the data
-      byte[] buf = new byte[1024];
-      while (!decompressor.finished()) {
-        try {
-          int count = decompressor.inflate(buf);
-          bos.write(buf, 0, count);
-        }
-        catch (DataFormatException e) {
-          // this will happen if the field is not compressed
-          CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString());
-          newException.initCause(e);
-          throw newException;
-        }
-      }
-    } finally {
-      decompressor.end();
+      return CompressionTools.decompress(b);
+    } catch (DataFormatException e) {
+      // this will happen if the field is not compressed
+      CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString());
+      newException.initCause(e);
+      throw newException;
     }
-
-    // Get the decompressed data
-    return bos.toByteArray();
   }
 
   // Instances of this class hold field properties and data
   // for merge
   final static class FieldForMerge extends AbstractField {
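
CompressionTools.decompress declares the checked java.util.zip.DataFormatException, so application code reading externally compressed fields needs a guard like the one FieldsReader now uses; a sketch with an illustrative field name:

    try {
      byte[] raw = CompressionTools.decompress(doc.getBinaryValue("body"));
      // use raw ...
    } catch (DataFormatException e) {
      // the stored bytes were not produced by CompressionTools.compress
    }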

org/apache/lucene/index/FieldsWriter.java

@@ -16,13 +16,12 @@ package org.apache.lucene.index;
  * the License.
  */
 
-import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.util.Iterator;
-import java.util.zip.Deflater;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.document.CompressionTools;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.store.IndexOutput;
@@ -203,10 +202,10 @@ final class FieldsWriter
       } else {
         // check if it is a binary field
         if (field.isBinary()) {
-          data = compress(field.getBinaryValue(), field.getBinaryOffset(), field.getBinaryLength());
+          data = CompressionTools.compress(field.getBinaryValue(), field.getBinaryOffset(), field.getBinaryLength());
         } else {
           byte x[] = field.stringValue().getBytes("UTF-8");
-          data = compress(x, 0, x.length);
+          data = CompressionTools.compress(x, 0, x.length);
         }
         len = data.length;
         offset = 0;
@@ -269,43 +268,4 @@ final class FieldsWriter
       writeField(fieldInfos.fieldInfo(field.name()), field);
     }
   }
-
-  private final byte[] compress (byte[] input, int offset, int length) {
-
-    // Create the compressor with highest level of compression
-    Deflater compressor = new Deflater();
-    compressor.setLevel(Deflater.BEST_COMPRESSION);
-
-    // Give the compressor the data to compress
-    compressor.setInput(input, offset, length);
-    compressor.finish();
-
-    /*
-     * Create an expandable byte array to hold the compressed data.
-     * You cannot use an array that's the same size as the orginal because
-     * there is no guarantee that the compressed data will be smaller than
-     * the uncompressed data.
-     */
-    ByteArrayOutputStream bos = new ByteArrayOutputStream(length);
-
-    try {
-      compressor.setLevel(Deflater.BEST_COMPRESSION);
-
-      // Give the compressor the data to compress
-      compressor.setInput(input);
-      compressor.finish();
-
-      // Compress the data
-      byte[] buf = new byte[1024];
-      while (!compressor.finished()) {
-        int count = compressor.deflate(buf);
-        bos.write(buf, 0, count);
-      }
-    } finally {
-      compressor.end();
-    }
-
-    // Get the compressed data
-    return bos.toByteArray();
-  }
 }

org/apache/lucene/document/TestBinaryDocument.java

@@ -5,7 +5,7 @@ import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -43,7 +43,7 @@ public class TestBinaryDocument extends LuceneTestCase
     Fieldable binaryFldCompressed = new Field("binaryCompressed", binaryValCompressed.getBytes(), Field.Store.COMPRESS);
     Fieldable stringFldStored = new Field("stringStored", binaryValStored, Field.Store.YES, Field.Index.NO, Field.TermVector.NO);
     Fieldable stringFldCompressed = new Field("stringCompressed", binaryValCompressed, Field.Store.COMPRESS, Field.Index.NO, Field.TermVector.NO);
 
     try {
       // binary fields with store off are not allowed
       new Field("fail", binaryValCompressed.getBytes(), Field.Store.NO);
@@ -60,12 +60,12 @@ public class TestBinaryDocument extends LuceneTestCase
     doc.add(stringFldStored);
     doc.add(stringFldCompressed);
 
     /** test for field count */
     assertEquals(4, doc.fields.size());
 
     /** add the doc to a ram index */
-    RAMDirectory dir = new RAMDirectory();
+    MockRAMDirectory dir = new MockRAMDirectory();
     IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
     writer.addDocument(doc);
     writer.close();
@@ -90,13 +90,40 @@ public class TestBinaryDocument extends LuceneTestCase
     /** fetch the compressed string field and compare it's content with the original one */
     String stringFldCompressedTest = docFromReader.get("stringCompressed");
     assertTrue(stringFldCompressedTest.equals(binaryValCompressed));
 
     /** delete the document from index */
     reader.deleteDocument(0);
     assertEquals(0, reader.numDocs());
 
     reader.close();
+    dir.close();
   }
+
+  public void testCompressionTools()
+    throws Exception
+  {
+    Fieldable binaryFldCompressed = new Field("binaryCompressed", CompressionTools.compress(binaryValCompressed.getBytes()), Field.Store.YES);
+
+    Document doc = new Document();
+    doc.add(binaryFldCompressed);
+
+    /** add the doc to a ram index */
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
+    writer.addDocument(doc);
+    writer.close();
+
+    /** open a reader and fetch the document */
+    IndexReader reader = IndexReader.open(dir);
+    Document docFromReader = reader.document(0);
+    assertTrue(docFromReader != null);
+
+    /** fetch the binary compressed field and compare its content with the original one */
+    String binaryFldCompressedTest = new String(CompressionTools.decompress(docFromReader.getBinaryValue("binaryCompressed")));
+    assertTrue(binaryFldCompressedTest.equals(binaryValCompressed));
+
+    reader.close();
+    dir.close();
+  }
 }