mirror of https://github.com/apache/lucene.git
Allow stored fields to be compressed (see Bug#31149)
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150546 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
168a155a8b
commit
3edae1d01c
|
@ -33,15 +33,16 @@ import org.apache.lucene.search.Similarity;
|
|||
|
||||
public final class Field implements java.io.Serializable {
|
||||
private String name = "body";
|
||||
private String stringValue = null;
|
||||
private Reader readerValue = null;
|
||||
private byte[] binaryValue = null;
|
||||
|
||||
// the one and only data object for all different kind of field values
|
||||
private Object fieldsData = null;
|
||||
|
||||
private boolean storeTermVector = false;
|
||||
private boolean isStored = false;
|
||||
private boolean isIndexed = true;
|
||||
private boolean isTokenized = true;
|
||||
private boolean isBinary = false;
|
||||
private boolean isCompressed = false;
|
||||
|
||||
private float boost = 1.0f;
|
||||
|
||||
|
@ -54,6 +55,10 @@ public final class Field implements java.io.Serializable {
|
|||
public String toString() {
|
||||
return name;
|
||||
}
|
||||
/** Store the original field value in the index in a compressed form. This is
|
||||
* useful for long documents and for binary valued fields.
|
||||
*/
|
||||
public static final Store COMPRESS = new Store("COMPRESS");
|
||||
/** Store the original field value in the index. This is useful for short texts
|
||||
* like a document's title which should be displayed with the results. The
|
||||
* value is stored in its original form, i.e. no analyzer is used before it is
|
||||
|
@ -220,18 +225,22 @@ public final class Field implements java.io.Serializable {
|
|||
|
||||
/** The name of the field (e.g., "date", "title", "body", ...)
|
||||
as an interned string. */
|
||||
public String name() { return name; }
|
||||
public String name() { return name; }
|
||||
|
||||
/** The value of the field as a String, or null. If null, the Reader value
|
||||
is used. Exactly one of stringValue() and readerValue() must be set. */
|
||||
public String stringValue() { return stringValue; }
|
||||
* or binary value is used. Exactly one of stringValue(), readerValue(), and
|
||||
* binaryValue() must be set. */
|
||||
public String stringValue() { try { return (String)fieldsData; } catch (ClassCastException ignore) { return null; } }
|
||||
|
||||
/** The value of the field as a Reader, or null. If null, the String value
|
||||
is used. Exactly one of stringValue() and readerValue() must be set. */
|
||||
public Reader readerValue() { return readerValue; }
|
||||
* or binary value is used. Exactly one of stringValue(), readerValue(),
|
||||
* and binaryValue() must be set. */
|
||||
public Reader readerValue() { try { return (Reader)fieldsData; } catch (ClassCastException ignore) { return null; } }
|
||||
|
||||
/** The value of the field in Binary, or null. If null, the Reader or
|
||||
String value is used. Exactly one of stringValue(), readerValue() and
|
||||
binaryValue() must be set. */
|
||||
public byte[] binaryValue() { return binaryValue; }
|
||||
* String value is used. Exactly one of stringValue(), readerValue() and
|
||||
* binaryValue() must be set. */
|
||||
public byte[] binaryValue() { try { return (byte[])fieldsData; } catch (ClassCastException ignore) { return null; } }
|
||||
|
||||
/**
|
||||
* Create a field by specifying its name, value and how it will
|
||||
|
@ -277,12 +286,16 @@ public final class Field implements java.io.Serializable {
|
|||
if (index == Index.NO && termVector != TermVector.NO)
|
||||
throw new IllegalArgumentException("cannot store term vector information "
|
||||
+ "for a field that is not indexed");
|
||||
|
||||
|
||||
this.name = name.intern(); // field names are interned
|
||||
this.stringValue = value;
|
||||
this.fieldsData = value;
|
||||
|
||||
if (store == Store.YES)
|
||||
this.isStored = true;
|
||||
else if (store == Store.COMPRESS) {
|
||||
this.isStored = true;
|
||||
this.isCompressed = true;
|
||||
}
|
||||
else if (store == Store.NO)
|
||||
this.isStored = false;
|
||||
else
|
||||
|
@ -331,7 +344,7 @@ public final class Field implements java.io.Serializable {
|
|||
if (reader == null)
|
||||
throw new NullPointerException("reader cannot be null");
|
||||
this.name = name.intern(); // field names are interned
|
||||
this.readerValue = reader;
|
||||
this.fieldsData = reader;
|
||||
this.isStored = false;
|
||||
this.isIndexed = true;
|
||||
this.isTokenized = true;
|
||||
|
@ -344,18 +357,31 @@ public final class Field implements java.io.Serializable {
|
|||
* @deprecated use {@link #Field(String, String, Field.Store, Field.Index)} instead
|
||||
*/
|
||||
public Field(String name, String string,
|
||||
boolean store, boolean index, boolean token) {
|
||||
boolean store, boolean index, boolean token) {
|
||||
this(name, string, store, index, token, false);
|
||||
}
|
||||
|
||||
public Field(String name, byte[] value) {
|
||||
|
||||
/**
|
||||
* Create a stored field with binary value. Optionally the value may be compressed.
|
||||
*
|
||||
* @param name The name of the field
|
||||
* @param value The binary value
|
||||
* @param store How <code>value</code> should be stored (compressed or not.)
|
||||
*/
|
||||
public Field(String name, byte[] value, Store store) {
|
||||
if (name == null)
|
||||
throw new IllegalArgumentException("name cannot be null");
|
||||
if (value == null)
|
||||
throw new IllegalArgumentException("value cannot be null");
|
||||
if (store == Store.NO)
|
||||
throw new IllegalArgumentException("binary values can't be unstored");
|
||||
if (store == Store.COMPRESS)
|
||||
this.isCompressed = true;
|
||||
|
||||
this.name = name.intern();
|
||||
this.binaryValue = value;
|
||||
//wrap the byte[] to a ByteBuffer object
|
||||
this.fieldsData = value;
|
||||
|
||||
this.isBinary = true;
|
||||
this.isStored = true;
|
||||
|
@ -377,7 +403,7 @@ public final class Field implements java.io.Serializable {
|
|||
* @deprecated use {@link #Field(String, String, Field.Store, Field.Index, Field.TermVector)} instead
|
||||
*/
|
||||
public Field(String name, String string,
|
||||
boolean store, boolean index, boolean token, boolean storeTermVector) {
|
||||
boolean store, boolean index, boolean token, boolean storeTermVector) {
|
||||
if (name == null)
|
||||
throw new NullPointerException("name cannot be null");
|
||||
if (string == null)
|
||||
|
@ -385,8 +411,8 @@ public final class Field implements java.io.Serializable {
|
|||
if (!index && storeTermVector)
|
||||
throw new IllegalArgumentException("cannot store a term vector for fields that are not indexed");
|
||||
|
||||
this.name = name.intern(); // field names are interned
|
||||
this.stringValue = string;
|
||||
this.name = name.intern(); // field names are interned
|
||||
this.fieldsData = string;
|
||||
this.isStored = store;
|
||||
this.isIndexed = index;
|
||||
this.isTokenized = token;
|
||||
|
@ -406,16 +432,19 @@ public final class Field implements java.io.Serializable {
|
|||
/** True iff the value of the field is to be stored in the index for return
|
||||
with search hits. It is an error for this to be true if a field is
|
||||
Reader-valued. */
|
||||
public final boolean isStored() { return isStored; }
|
||||
public final boolean isStored() { return isStored; }
|
||||
|
||||
/** True iff the value of the field is to be indexed, so that it may be
|
||||
searched on. */
|
||||
public final boolean isIndexed() { return isIndexed; }
|
||||
public final boolean isIndexed() { return isIndexed; }
|
||||
|
||||
/** True iff the value of the field should be tokenized as text prior to
|
||||
indexing. Un-tokenized fields are indexed as a single word and may not be
|
||||
Reader-valued. */
|
||||
public final boolean isTokenized() { return isTokenized; }
|
||||
public final boolean isTokenized() { return isTokenized; }
|
||||
|
||||
/** True if the value of the field is stored and compressed within the index */
|
||||
public final boolean isCompressed() { return isCompressed; }
|
||||
|
||||
/** True iff the term or terms used to index this field are stored as a term
|
||||
* vector, available from {@link IndexReader#getTermFreqVector(int,String)}.
|
||||
|
@ -456,14 +485,20 @@ public final class Field implements java.io.Serializable {
|
|||
result.append("binary");
|
||||
}
|
||||
|
||||
if (isCompressed) {
|
||||
if (result.length() > 0)
|
||||
result.append(",");
|
||||
result.append("compressed");
|
||||
}
|
||||
|
||||
result.append('<');
|
||||
result.append(name);
|
||||
result.append(':');
|
||||
if (readerValue != null) {
|
||||
result.append(readerValue.toString());
|
||||
} else {
|
||||
result.append(stringValue);
|
||||
|
||||
if (fieldsData != null) {
|
||||
result.append(fieldsData);
|
||||
}
|
||||
|
||||
result.append('>');
|
||||
return result.toString();
|
||||
}
|
||||
|
|
|
@ -16,7 +16,10 @@ package org.apache.lucene.index;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.zip.DataFormatException;
|
||||
import java.util.zip.Inflater;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
@ -66,29 +69,77 @@ final class FieldsReader {
|
|||
FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
|
||||
|
||||
byte bits = fieldsStream.readByte();
|
||||
|
||||
if ((bits & 2) != 0) {
|
||||
|
||||
boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
|
||||
boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
|
||||
|
||||
if ((bits & FieldsWriter.FIELD_IS_BINARY) != 0) {
|
||||
final byte[] b = new byte[fieldsStream.readVInt()];
|
||||
fieldsStream.readBytes(b, 0, b.length);
|
||||
doc.add(new Field(fi.name, b));
|
||||
if (compressed)
|
||||
doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS));
|
||||
else
|
||||
doc.add(new Field(fi.name, b, Field.Store.YES));
|
||||
}
|
||||
else {
|
||||
Field.Index index;
|
||||
boolean tokenize = (bits & 1) != 0;
|
||||
Field.Store store = Field.Store.YES;
|
||||
|
||||
if (fi.isIndexed && tokenize)
|
||||
index = Field.Index.TOKENIZED;
|
||||
else if (fi.isIndexed && !tokenize)
|
||||
index = Field.Index.UN_TOKENIZED;
|
||||
else
|
||||
index = Field.Index.NO;
|
||||
doc.add(new Field(fi.name, // name
|
||||
fieldsStream.readString(), // read value
|
||||
Field.Store.YES, index,
|
||||
fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
|
||||
|
||||
if (compressed) {
|
||||
store = Field.Store.COMPRESS;
|
||||
final byte[] b = new byte[fieldsStream.readVInt()];
|
||||
fieldsStream.readBytes(b, 0, b.length);
|
||||
doc.add(new Field(fi.name, // field name
|
||||
new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
|
||||
store,
|
||||
index,
|
||||
fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
|
||||
}
|
||||
else
|
||||
doc.add(new Field(fi.name, // name
|
||||
fieldsStream.readString(), // read value
|
||||
store,
|
||||
index,
|
||||
fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
|
||||
}
|
||||
}
|
||||
|
||||
return doc;
|
||||
}
|
||||
|
||||
private final byte[] uncompress(final byte[] input)
|
||||
throws IOException
|
||||
{
|
||||
|
||||
Inflater decompressor = new Inflater();
|
||||
decompressor.setInput(input);
|
||||
|
||||
// Create an expandable byte array to hold the decompressed data
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
|
||||
|
||||
// Decompress the data
|
||||
byte[] buf = new byte[1024];
|
||||
while (!decompressor.finished()) {
|
||||
try {
|
||||
int count = decompressor.inflate(buf);
|
||||
bos.write(buf, 0, count);
|
||||
}
|
||||
catch (DataFormatException e) {
|
||||
// this will happen if the field is not compressed
|
||||
throw new IOException ("field data are in wrong format: " + e.toString());
|
||||
}
|
||||
}
|
||||
|
||||
decompressor.end();
|
||||
|
||||
// Get the decompressed data
|
||||
return bos.toByteArray();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,8 +16,10 @@ package org.apache.lucene.index;
|
|||
* the License.
|
||||
*/
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Enumeration;
|
||||
import java.util.zip.Deflater;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
@ -26,6 +28,10 @@ import org.apache.lucene.store.IndexOutput;
|
|||
|
||||
final class FieldsWriter
|
||||
{
|
||||
static final short FIELD_IS_TOKENIZED = 1;
|
||||
static final short FIELD_IS_BINARY = 2;
|
||||
static final short FIELD_IS_COMPRESSED = 4;
|
||||
|
||||
private FieldInfos fieldInfos;
|
||||
|
||||
private IndexOutput fieldsStream;
|
||||
|
@ -63,21 +69,72 @@ final class FieldsWriter
|
|||
|
||||
byte bits = 0;
|
||||
if (field.isTokenized())
|
||||
bits |= 1;
|
||||
bits |= FieldsWriter.FIELD_IS_TOKENIZED;
|
||||
if (field.isBinary())
|
||||
bits |= 2;
|
||||
bits |= FieldsWriter.FIELD_IS_BINARY;
|
||||
if (field.isCompressed())
|
||||
bits |= FieldsWriter.FIELD_IS_COMPRESSED;
|
||||
|
||||
fieldsStream.writeByte(bits);
|
||||
|
||||
if (field.isBinary()) {
|
||||
|
||||
if (field.isCompressed()) {
|
||||
// compression is enabled for the current field
|
||||
byte[] data = null;
|
||||
// check if it is a binary field
|
||||
if (field.isBinary()) {
|
||||
data = compress(field.binaryValue());
|
||||
}
|
||||
else {
|
||||
data = compress(field.stringValue().getBytes("UTF-8"));
|
||||
}
|
||||
final int len = data.length;
|
||||
fieldsStream.writeVInt(len);
|
||||
fieldsStream.writeBytes(data, len);
|
||||
}
|
||||
else {
|
||||
// compression is disabled for the current field
|
||||
if (field.isBinary()) {
|
||||
byte[] data = field.binaryValue();
|
||||
final int len = data.length;
|
||||
fieldsStream.writeVInt(len);
|
||||
fieldsStream.writeBytes(data, len);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
fieldsStream.writeString(field.stringValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final byte[] compress (byte[] input) {
|
||||
|
||||
// Create the compressor with highest level of compression
|
||||
Deflater compressor = new Deflater();
|
||||
compressor.setLevel(Deflater.BEST_COMPRESSION);
|
||||
|
||||
// Give the compressor the data to compress
|
||||
compressor.setInput(input);
|
||||
compressor.finish();
|
||||
|
||||
/*
|
||||
* Create an expandable byte array to hold the compressed data.
|
||||
* You cannot use an array that's the same size as the orginal because
|
||||
* there is no guarantee that the compressed data will be smaller than
|
||||
* the uncompressed data.
|
||||
*/
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
|
||||
|
||||
// Compress the data
|
||||
byte[] buf = new byte[1024];
|
||||
while (!compressor.finished()) {
|
||||
int count = compressor.deflate(buf);
|
||||
bos.write(buf, 0, count);
|
||||
}
|
||||
|
||||
compressor.end();
|
||||
|
||||
// Get the compressed data
|
||||
return bos.toByteArray();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,101 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Tests {@link Document} class.
|
||||
*
|
||||
* @author Bernhard Messer
|
||||
* @version $Id$
|
||||
*/
|
||||
public class TestBinaryDocument extends TestCase
|
||||
{
|
||||
|
||||
String binaryValStored = "this text will be stored as a byte array in the index";
|
||||
String binaryValCompressed = "this text will be also stored and compressed as a byte array in the index";
|
||||
|
||||
public void testBinaryFieldInIndex()
|
||||
throws Exception
|
||||
{
|
||||
Field binaryFldStored = new Field("binaryStored", binaryValStored.getBytes(), Field.Store.YES);
|
||||
Field binaryFldCompressed = new Field("binaryCompressed", binaryValCompressed.getBytes(), Field.Store.COMPRESS);
|
||||
Field stringFldStored = new Field("stringStored", binaryValStored, Field.Store.YES, Field.Index.NO, Field.TermVector.NO);
|
||||
Field stringFldCompressed = new Field("stringCompressed", binaryValCompressed, Field.Store.COMPRESS, Field.Index.NO, Field.TermVector.NO);
|
||||
|
||||
try {
|
||||
// binary fields with store off are not allowed
|
||||
new Field("fail", binaryValCompressed.getBytes(), Field.Store.NO);
|
||||
fail();
|
||||
}
|
||||
catch (IllegalArgumentException iae) {
|
||||
;
|
||||
}
|
||||
|
||||
Document doc = new Document();
|
||||
|
||||
doc.add(binaryFldStored);
|
||||
doc.add(binaryFldCompressed);
|
||||
|
||||
doc.add(stringFldStored);
|
||||
doc.add(stringFldCompressed);
|
||||
|
||||
/** test for field count */
|
||||
assertEquals(4, doc.fields.size());
|
||||
|
||||
/** add the doc to a ram index */
|
||||
RAMDirectory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
|
||||
/** open a reader and fetch the document */
|
||||
IndexReader reader = IndexReader.open(dir);
|
||||
Document docFromReader = reader.document(0);
|
||||
assertTrue(docFromReader != null);
|
||||
|
||||
/** fetch the binary stored field and compare it's content with the original one */
|
||||
String binaryFldStoredTest = new String(docFromReader.getBinaryValue("binaryStored"));
|
||||
assertTrue(binaryFldStoredTest.equals(binaryValStored));
|
||||
|
||||
/** fetch the binary compressed field and compare it's content with the original one */
|
||||
String binaryFldCompressedTest = new String(docFromReader.getBinaryValue("binaryCompressed"));
|
||||
assertTrue(binaryFldCompressedTest.equals(binaryValCompressed));
|
||||
|
||||
/** fetch the string field and compare it's content with the original one */
|
||||
String stringFldStoredTest = new String(docFromReader.get("stringStored"));
|
||||
assertTrue(stringFldStoredTest.equals(binaryValStored));
|
||||
|
||||
/** fetch the compressed string field and compare it's content with the original one */
|
||||
String stringFldCompressedTest = new String(docFromReader.get("stringCompressed"));
|
||||
assertTrue(stringFldCompressedTest.equals(binaryValCompressed));
|
||||
|
||||
/** delete the document from index */
|
||||
reader.delete(0);
|
||||
assertEquals(0, reader.numDocs());
|
||||
|
||||
reader.close();
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -47,8 +47,8 @@ public class TestDocument extends TestCase
|
|||
{
|
||||
Document doc = new Document();
|
||||
Field stringFld = new Field("string", binaryVal, Field.Store.YES, Field.Index.NO);
|
||||
Field binaryFld = new Field("binary", binaryVal.getBytes());
|
||||
Field binaryFld2 = new Field("binary", binaryVal2.getBytes());
|
||||
Field binaryFld = new Field("binary", binaryVal.getBytes(), Field.Store.YES);
|
||||
Field binaryFld2 = new Field("binary", binaryVal2.getBytes(), Field.Store.YES);
|
||||
|
||||
doc.add(stringFld);
|
||||
doc.add(binaryFld);
|
||||
|
|
Loading…
Reference in New Issue