From 3edae1d01c8e560333c08d62cc1d05795a2b9221 Mon Sep 17 00:00:00 2001
From: Christoph Goller
Date: Thu, 30 Sep 2004 12:40:28 +0000
Subject: [PATCH] Allow stored fields to be compressed (see Bug#31149)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150546 13f79535-47bb-0310-9956-ffa450edef68
---
 .../org/apache/lucene/document/Field.java     |  89 ++++++++++-----
 .../org/apache/lucene/index/FieldsReader.java |  67 ++++++++++--
 .../org/apache/lucene/index/FieldsWriter.java |  67 +++++++++++-
 .../lucene/document/TestBinaryDocument.java   | 101 ++++++++++++++++++
 .../apache/lucene/document/TestDocument.java  |   4 +-
 5 files changed, 286 insertions(+), 42 deletions(-)
 create mode 100644 src/test/org/apache/lucene/document/TestBinaryDocument.java

diff --git a/src/java/org/apache/lucene/document/Field.java b/src/java/org/apache/lucene/document/Field.java
index 90fe3fb822a..5757f7aa1a9 100644
--- a/src/java/org/apache/lucene/document/Field.java
+++ b/src/java/org/apache/lucene/document/Field.java
@@ -33,15 +33,16 @@ import org.apache.lucene.search.Similarity;
 public final class Field implements java.io.Serializable {
   private String name = "body";
-  private String stringValue = null;
-  private Reader readerValue = null;
-  private byte[] binaryValue = null;
+
+  // the one and only data object for all the different kinds of field values
+  private Object fieldsData = null;
 
   private boolean storeTermVector = false;
   private boolean isStored = false;
   private boolean isIndexed = true;
   private boolean isTokenized = true;
   private boolean isBinary = false;
+  private boolean isCompressed = false;
 
   private float boost = 1.0f;
 
@@ -54,6 +55,10 @@ public final class Field implements java.io.Serializable {
     public String toString() {
       return name;
     }
+    /** Store the original field value in the index in a compressed form. This is
+     * useful for long documents and for binary valued fields.
+     */
+    public static final Store COMPRESS = new Store("COMPRESS");
     /** Store the original field value in the index. This is useful for short texts
      * like a document's title which should be displayed with the results. The
      * value is stored in its original form, i.e. no analyzer is used before it is
@@ -220,18 +225,22 @@ public final class Field implements java.io.Serializable {
   /** The name of the field (e.g., "date", "title", "body", ...) as an interned
     string. */
-  public String name()          { return name; }
+  public String name()    { return name; }
 
   /** The value of the field as a String, or null. If null, the Reader value
-    is used. Exactly one of stringValue() and readerValue() must be set. */
-  public String stringValue()   { return stringValue; }
+   * or binary value is used. Exactly one of stringValue(), readerValue(), and
+   * binaryValue() must be set. */
+  public String stringValue()   { try { return (String)fieldsData; } catch (ClassCastException ignore) { return null; } }
+
   /** The value of the field as a Reader, or null. If null, the String value
-    is used. Exactly one of stringValue() and readerValue() must be set. */
-  public Reader readerValue()   { return readerValue; }
+   * or binary value is used. Exactly one of stringValue(), readerValue(),
+   * and binaryValue() must be set. */
+  public Reader readerValue()   { try { return (Reader)fieldsData; } catch (ClassCastException ignore) { return null; } }
+
   /** The value of the field in Binary, or null. If null, the Reader or
-    String value is used. Exactly one of stringValue(), readerValue() and
-    binaryValue() must be set. */
-  public byte[] binaryValue() { return binaryValue; }
+   * String value is used. Exactly one of stringValue(), readerValue() and
+   * binaryValue() must be set. */
+  public byte[] binaryValue() { try { return (byte[])fieldsData; } catch (ClassCastException ignore) { return null; } }
 
   /**
    * Create a field by specifying its name, value and how it will
@@ -277,12 +286,16 @@ public final class Field implements java.io.Serializable {
     if (index == Index.NO && termVector != TermVector.NO)
       throw new IllegalArgumentException("cannot store term vector information " +
           "for a field that is not indexed");
-    
+
     this.name = name.intern();        // field names are interned
-    this.stringValue = value;
+    this.fieldsData = value;
 
     if (store == Store.YES)
       this.isStored = true;
+    else if (store == Store.COMPRESS) {
+      this.isStored = true;
+      this.isCompressed = true;
+    }
     else if (store == Store.NO)
       this.isStored = false;
     else
@@ -331,7 +344,7 @@ public final class Field implements java.io.Serializable {
     if (reader == null)
       throw new NullPointerException("reader cannot be null");
     this.name = name.intern();        // field names are interned
-    this.readerValue = reader;
+    this.fieldsData = reader;
     this.isStored = false;
     this.isIndexed = true;
     this.isTokenized = true;
@@ -344,18 +357,31 @@ public final class Field implements java.io.Serializable {
    * @deprecated use {@link #Field(String, String, Field.Store, Field.Index)} instead
    */
   public Field(String name, String string,
-               boolean store, boolean index, boolean token) {
+              boolean store, boolean index, boolean token) {
     this(name, string, store, index, token, false);
   }
 
-  public Field(String name, byte[] value) {
+
+  /**
+   * Create a stored field with binary value. Optionally the value may be compressed.
+   *
+   * @param name The name of the field
+   * @param value The binary value
+   * @param store How the value should be stored (compressed or not)
+   */
+  public Field(String name, byte[] value, Store store) {
     if (name == null)
      throw new IllegalArgumentException("name cannot be null");
    if (value == null)
      throw new IllegalArgumentException("value cannot be null");
+    if (store == Store.NO)
+      throw new IllegalArgumentException("binary values can't be unstored");
+    if (store == Store.COMPRESS)
+      this.isCompressed = true;
 
    this.name = name.intern();
-    this.binaryValue = value;
+    // store the byte[] value directly in the field's data object
+    this.fieldsData = value;
 
    this.isBinary = true;
    this.isStored = true;
@@ -377,7 +403,7 @@ public final class Field implements java.io.Serializable {
    * @deprecated use {@link #Field(String, String, Field.Store, Field.Index, Field.TermVector)} instead
    */
   public Field(String name, String string,
-               boolean store, boolean index, boolean token, boolean storeTermVector) {
+              boolean store, boolean index, boolean token, boolean storeTermVector) {
     if (name == null)
       throw new NullPointerException("name cannot be null");
     if (string == null)
@@ -385,8 +411,8 @@ public final class Field implements java.io.Serializable {
     if (!index && storeTermVector)
       throw new IllegalArgumentException("cannot store a term vector for fields that are not indexed");
 
-    this.name = name.intern();        // field names are interned
-    this.stringValue = string;
+    this.name = name.intern();    // field names are interned
+    this.fieldsData = string;
     this.isStored = store;
     this.isIndexed = index;
     this.isTokenized = token;
@@ -406,16 +432,19 @@ public final class Field implements java.io.Serializable {
   /** True iff the value of the field is to be stored in the index for return
     with search hits.
     It is an error for this to be true if a field is Reader-valued. */
-  public final boolean isStored()       { return isStored; }
+  public final boolean isStored()   { return isStored; }
 
   /** True iff the value of the field is to be indexed, so that it may be
     searched on. */
-  public final boolean isIndexed()      { return isIndexed; }
+  public final boolean isIndexed()  { return isIndexed; }
 
   /** True iff the value of the field should be tokenized as text prior to
     indexing. Un-tokenized fields are indexed as a single word and may not be
     Reader-valued. */
-  public final boolean isTokenized()    { return isTokenized; }
+  public final boolean isTokenized()    { return isTokenized; }
+
+  /** True if the value of the field is stored and compressed within the index */
+  public final boolean isCompressed()   { return isCompressed; }
 
   /** True iff the term or terms used to index this field are stored as a term
    * vector, available from {@link IndexReader#getTermFreqVector(int,String)}.
@@ -456,14 +485,20 @@ public final class Field implements java.io.Serializable {
       result.append("binary");
     }
+
+    if (isCompressed) {
+      if (result.length() > 0)
+        result.append(",");
+      result.append("compressed");
+    }
 
     result.append('<');
     result.append(name);
     result.append(':');
-    if (readerValue != null) {
-      result.append(readerValue.toString());
-    } else {
-      result.append(stringValue);
+
+    if (fieldsData != null) {
+      result.append(fieldsData);
     }
+
     result.append('>');
 
     return result.toString();
   }
diff --git a/src/java/org/apache/lucene/index/FieldsReader.java b/src/java/org/apache/lucene/index/FieldsReader.java
index c8f37873b84..3d8a8d7f166 100644
--- a/src/java/org/apache/lucene/index/FieldsReader.java
+++ b/src/java/org/apache/lucene/index/FieldsReader.java
@@ -16,7 +16,10 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.util.zip.DataFormatException;
+import java.util.zip.Inflater;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -66,29 +69,77 @@ final class FieldsReader {
       FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
 
       byte bits = fieldsStream.readByte();
-      
-      if ((bits & 2) != 0) {
+
+      boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
+      boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
+
+      if ((bits & FieldsWriter.FIELD_IS_BINARY) != 0) {
         final byte[] b = new byte[fieldsStream.readVInt()];
         fieldsStream.readBytes(b, 0, b.length);
-        doc.add(new Field(fi.name, b));
+        if (compressed)
+          doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS));
+        else
+          doc.add(new Field(fi.name, b, Field.Store.YES));
       } else {
         Field.Index index;
-        boolean tokenize = (bits & 1) != 0;
+        Field.Store store = Field.Store.YES;
+
         if (fi.isIndexed && tokenize)
           index = Field.Index.TOKENIZED;
         else if (fi.isIndexed && !tokenize)
           index = Field.Index.UN_TOKENIZED;
         else
           index = Field.Index.NO;
-        doc.add(new Field(fi.name,          // name
-                fieldsStream.readString(),  // read value
-                Field.Store.YES, index,
-                fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
+
+        if (compressed) {
+          store = Field.Store.COMPRESS;
+          final byte[] b = new byte[fieldsStream.readVInt()];
+          fieldsStream.readBytes(b, 0, b.length);
+          doc.add(new Field(fi.name,        // field name
+                  new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
+                  store,
+                  index,
+                  fi.storeTermVector ?
+                  Field.TermVector.YES : Field.TermVector.NO));
+        }
+        else
+          doc.add(new Field(fi.name,        // name
+                  fieldsStream.readString(), // read value
+                  store,
+                  index,
+                  fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
       }
     }
     return doc;
   }
+
+  private final byte[] uncompress(final byte[] input)
+    throws IOException
+  {
+
+    Inflater decompressor = new Inflater();
+    decompressor.setInput(input);
+
+    // Create an expandable byte array to hold the decompressed data
+    ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
+
+    // Decompress the data
+    byte[] buf = new byte[1024];
+    while (!decompressor.finished()) {
+      try {
+        int count = decompressor.inflate(buf);
+        bos.write(buf, 0, count);
+      }
+      catch (DataFormatException e) {
+        // this will happen if the field is not compressed
+        throw new IOException("field data is in the wrong format: " + e.toString());
+      }
+    }
+
+    decompressor.end();
+
+    // Get the decompressed data
+    return bos.toByteArray();
+  }
 }
diff --git a/src/java/org/apache/lucene/index/FieldsWriter.java b/src/java/org/apache/lucene/index/FieldsWriter.java
index 4e99e231619..77f6e419f98 100644
--- a/src/java/org/apache/lucene/index/FieldsWriter.java
+++ b/src/java/org/apache/lucene/index/FieldsWriter.java
@@ -16,8 +16,10 @@ package org.apache.lucene.index;
  * the License.
  */
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.util.Enumeration;
+import java.util.zip.Deflater;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -26,6 +28,10 @@ import org.apache.lucene.store.IndexOutput;
 
 final class FieldsWriter
 {
+  static final short FIELD_IS_TOKENIZED = 1;
+  static final short FIELD_IS_BINARY = 2;
+  static final short FIELD_IS_COMPRESSED = 4;
+
   private FieldInfos fieldInfos;
 
   private IndexOutput fieldsStream;
@@ -63,21 +69,72 @@ final class FieldsWriter
         byte bits = 0;
         if (field.isTokenized())
-          bits |= 1;
+          bits |= FieldsWriter.FIELD_IS_TOKENIZED;
         if (field.isBinary())
-          bits |= 2;
+          bits |= FieldsWriter.FIELD_IS_BINARY;
+        if (field.isCompressed())
+          bits |= FieldsWriter.FIELD_IS_COMPRESSED;
+
         fieldsStream.writeByte(bits);
-        
-        if (field.isBinary()) {
+
+        if (field.isCompressed()) {
+          // compression is enabled for the current field
+          byte[] data = null;
+          // check if it is a binary field
+          if (field.isBinary()) {
+            data = compress(field.binaryValue());
+          }
+          else {
+            data = compress(field.stringValue().getBytes("UTF-8"));
+          }
+          final int len = data.length;
+          fieldsStream.writeVInt(len);
+          fieldsStream.writeBytes(data, len);
+        }
+        else {
+          // compression is disabled for the current field
+          if (field.isBinary()) {
            byte[] data = field.binaryValue();
            final int len = data.length;
            fieldsStream.writeVInt(len);
            fieldsStream.writeBytes(data, len);
-        } else {
+          }
+          else {
            fieldsStream.writeString(field.stringValue());
+          }
         }
       }
     }
   }
+  private final byte[] compress (byte[] input) {
+
+    // Create the compressor with highest level of compression
+    Deflater compressor = new Deflater();
+    compressor.setLevel(Deflater.BEST_COMPRESSION);
+
+    // Give the compressor the data to compress
+    compressor.setInput(input);
+    compressor.finish();
+
+    /*
+     * Create an expandable byte array to hold the compressed data.
+     * You cannot use an array that's the same size as the original because
+     * there is no guarantee that the compressed data will be smaller than
+     * the uncompressed data.
+ */ + ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length); + + // Compress the data + byte[] buf = new byte[1024]; + while (!compressor.finished()) { + int count = compressor.deflate(buf); + bos.write(buf, 0, count); + } + + compressor.end(); + + // Get the compressed data + return bos.toByteArray(); + } } diff --git a/src/test/org/apache/lucene/document/TestBinaryDocument.java b/src/test/org/apache/lucene/document/TestBinaryDocument.java new file mode 100644 index 00000000000..02358f6503c --- /dev/null +++ b/src/test/org/apache/lucene/document/TestBinaryDocument.java @@ -0,0 +1,101 @@ +package org.apache.lucene.document; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.store.RAMDirectory; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Tests {@link Document} class. + * + * @author Bernhard Messer + * @version $Id$ + */ +public class TestBinaryDocument extends TestCase +{ + + String binaryValStored = "this text will be stored as a byte array in the index"; + String binaryValCompressed = "this text will be also stored and compressed as a byte array in the index"; + + public void testBinaryFieldInIndex() + throws Exception + { + Field binaryFldStored = new Field("binaryStored", binaryValStored.getBytes(), Field.Store.YES); + Field binaryFldCompressed = new Field("binaryCompressed", binaryValCompressed.getBytes(), Field.Store.COMPRESS); + Field stringFldStored = new Field("stringStored", binaryValStored, Field.Store.YES, Field.Index.NO, Field.TermVector.NO); + Field stringFldCompressed = new Field("stringCompressed", binaryValCompressed, Field.Store.COMPRESS, Field.Index.NO, Field.TermVector.NO); + + try { + // binary fields with store off are not allowed + new Field("fail", binaryValCompressed.getBytes(), Field.Store.NO); + fail(); + } + catch (IllegalArgumentException iae) { + ; + } + + Document doc = new Document(); + + doc.add(binaryFldStored); + doc.add(binaryFldCompressed); + + doc.add(stringFldStored); + doc.add(stringFldCompressed); + + /** test for field count */ + assertEquals(4, doc.fields.size()); + + /** add the doc to a ram index */ + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true); + writer.addDocument(doc); + writer.close(); + + /** open a reader and fetch the document */ + IndexReader reader = IndexReader.open(dir); + Document docFromReader = reader.document(0); + assertTrue(docFromReader != null); + + /** fetch the binary stored field and compare it's content with the original one */ + String binaryFldStoredTest = new String(docFromReader.getBinaryValue("binaryStored")); + assertTrue(binaryFldStoredTest.equals(binaryValStored)); + + /** fetch the binary compressed field and compare it's content with the original one */ + String binaryFldCompressedTest 
+      = new String(docFromReader.getBinaryValue("binaryCompressed"));
+    assertTrue(binaryFldCompressedTest.equals(binaryValCompressed));
+
+    /** fetch the string field and compare its content with the original one */
+    String stringFldStoredTest = new String(docFromReader.get("stringStored"));
+    assertTrue(stringFldStoredTest.equals(binaryValStored));
+
+    /** fetch the compressed string field and compare its content with the original one */
+    String stringFldCompressedTest = new String(docFromReader.get("stringCompressed"));
+    assertTrue(stringFldCompressedTest.equals(binaryValCompressed));
+
+    /** delete the document from the index */
+    reader.delete(0);
+    assertEquals(0, reader.numDocs());
+
+    reader.close();
+
+  }
+
+}
diff --git a/src/test/org/apache/lucene/document/TestDocument.java b/src/test/org/apache/lucene/document/TestDocument.java
index ff0c1aaa8a5..182056552d0 100644
--- a/src/test/org/apache/lucene/document/TestDocument.java
+++ b/src/test/org/apache/lucene/document/TestDocument.java
@@ -47,8 +47,8 @@ public class TestDocument extends TestCase {
     Document doc = new Document();
     Field stringFld = new Field("string", binaryVal, Field.Store.YES, Field.Index.NO);
-    Field binaryFld = new Field("binary", binaryVal.getBytes());
-    Field binaryFld2 = new Field("binary", binaryVal2.getBytes());
+    Field binaryFld = new Field("binary", binaryVal.getBytes(), Field.Store.YES);
+    Field binaryFld2 = new Field("binary", binaryVal2.getBytes(), Field.Store.YES);
     doc.add(stringFld);
     doc.add(binaryFld);
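
Usage note (illustrative, not part of the patch): the sketch below shows how the new Field.Store.COMPRESS option could be exercised end to end. It assumes the 1.4-era API already used in TestBinaryDocument above (RAMDirectory, StandardAnalyzer, IndexReader.open, Document.getBinaryValue); the class name CompressedFieldExample and the field names "body" and "blob" are invented for illustration.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class CompressedFieldExample {
  public static void main(String[] args) throws Exception {
    String body = "a long article body that is worth compressing ...";
    byte[] blob = body.getBytes("UTF-8");

    Document doc = new Document();
    // Indexed text field whose stored value is deflated on disk by FieldsWriter.
    doc.add(new Field("body", body, Field.Store.COMPRESS, Field.Index.TOKENIZED));
    // Stored-only binary field, also compressed (Store.NO is rejected for byte[] values).
    doc.add(new Field("blob", blob, Field.Store.COMPRESS));

    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
    writer.addDocument(doc);
    writer.close();

    // FieldsReader inflates compressed values transparently, so callers get the
    // original string and bytes back without any extra work.
    IndexReader reader = IndexReader.open(dir);
    Document stored = reader.document(0);
    System.out.println(stored.get("body"));                   // original text
    System.out.println(stored.getBinaryValue("blob").length); // original byte count
    reader.close();
  }
}

The on-disk cost is one flag bit per stored field: FieldsWriter sets FIELD_IS_COMPRESSED and writes the Deflater output (BEST_COMPRESSION), and FieldsReader checks the same bit and runs the Inflater loop before rebuilding the Field, so compression stays invisible to callers of Document.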