From 3edae1d01c8e560333c08d62cc1d05795a2b9221 Mon Sep 17 00:00:00 2001
From: Christoph Goller
Date: Thu, 30 Sep 2004 12:40:28 +0000
Subject: [PATCH] Allow stored fields to be compressed (see Bug#31149)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150546 13f79535-47bb-0310-9956-ffa450edef68
---
 .../org/apache/lucene/document/Field.java     |  89 ++++++++++-----
 .../org/apache/lucene/index/FieldsReader.java |  67 ++++++++++--
 .../org/apache/lucene/index/FieldsWriter.java |  67 +++++++++++-
 .../lucene/document/TestBinaryDocument.java   | 101 ++++++++++++++++++
 .../apache/lucene/document/TestDocument.java  |   4 +-
 5 files changed, 286 insertions(+), 42 deletions(-)
 create mode 100644 src/test/org/apache/lucene/document/TestBinaryDocument.java

diff --git a/src/java/org/apache/lucene/document/Field.java b/src/java/org/apache/lucene/document/Field.java
index 90fe3fb822a..5757f7aa1a9 100644
--- a/src/java/org/apache/lucene/document/Field.java
+++ b/src/java/org/apache/lucene/document/Field.java
@@ -33,15 +33,16 @@ import org.apache.lucene.search.Similarity;
 public final class Field implements java.io.Serializable {
   private String name = "body";
-  private String stringValue = null;
-  private Reader readerValue = null;
-  private byte[] binaryValue = null;
+
+  // the one and only data object for all the different kinds of field values
+  private Object fieldsData = null;
 
   private boolean storeTermVector = false;
   private boolean isStored = false;
   private boolean isIndexed = true;
   private boolean isTokenized = true;
   private boolean isBinary = false;
+  private boolean isCompressed = false;
 
   private float boost = 1.0f;
 
@@ -54,6 +55,10 @@ public final class Field implements java.io.Serializable {
     public String toString() {
       return name;
     }
+    /** Store the original field value in the index in a compressed form. This is
+     * useful for long documents and for binary valued fields.
+     */
+    public static final Store COMPRESS = new Store("COMPRESS");
     /** Store the original field value in the index. This is useful for short texts
      * like a document's title which should be displayed with the results. The
      * value is stored in its original form, i.e. no analyzer is used before it is
@@ -220,18 +225,22 @@ public final class Field implements java.io.Serializable {
   /** The name of the field (e.g., "date", "title", "body", ...) as an interned
     string. */
-  public String name()          { return name; }
+  public String name()    { return name; }
 
   /** The value of the field as a String, or null. If null, the Reader value
-    is used. Exactly one of stringValue() and readerValue() must be set. */
-  public String stringValue()   { return stringValue; }
+   * or binary value is used. Exactly one of stringValue(), readerValue(), and
+   * binaryValue() must be set. */
+  public String stringValue()   { try { return (String)fieldsData; } catch (ClassCastException ignore) { return null; } }
+
   /** The value of the field as a Reader, or null. If null, the String value
-    is used. Exactly one of stringValue() and readerValue() must be set. */
-  public Reader readerValue()   { return readerValue; }
+   * or binary value is used. Exactly one of stringValue(), readerValue(),
+   * and binaryValue() must be set. */
+  public Reader readerValue()   { try { return (Reader)fieldsData; } catch (ClassCastException ignore) { return null; } }
+
   /** The value of the field in Binary, or null. If null, the Reader or
-    String value is used. Exactly one of stringValue(), readerValue() and
-    binaryValue() must be set. */
-  public byte[] binaryValue() { return binaryValue; }
+   * String value is used. Exactly one of stringValue(), readerValue() and
+   * binaryValue() must be set. */
+  public byte[] binaryValue() { try { return (byte[])fieldsData; } catch (ClassCastException ignore) { return null; } }
 
   /**
    * Create a field by specifying its name, value and how it will
@@ -277,12 +286,16 @@ public final class Field implements java.io.Serializable {
     if (index == Index.NO && termVector != TermVector.NO)
       throw new IllegalArgumentException("cannot store term vector information " +
           "for a field that is not indexed");
-    
+
     this.name = name.intern();        // field names are interned
-    this.stringValue = value;
+    this.fieldsData = value;
 
     if (store == Store.YES)
       this.isStored = true;
+    else if (store == Store.COMPRESS) {
+      this.isStored = true;
+      this.isCompressed = true;
+    }
     else if (store == Store.NO)
       this.isStored = false;
     else
@@ -331,7 +344,7 @@ public final class Field implements java.io.Serializable {
     if (reader == null)
       throw new NullPointerException("reader cannot be null");
     this.name = name.intern();        // field names are interned
-    this.readerValue = reader;
+    this.fieldsData = reader;
     this.isStored = false;
     this.isIndexed = true;
     this.isTokenized = true;
@@ -344,18 +357,31 @@ public final class Field implements java.io.Serializable {
    * @deprecated use {@link #Field(String, String, Field.Store, Field.Index)} instead
    */
   public Field(String name, String string,
-               boolean store, boolean index, boolean token) {
+              boolean store, boolean index, boolean token) {
     this(name, string, store, index, token, false);
   }
 
-  public Field(String name, byte[] value) {
+
+  /**
+   * Create a stored field with binary value. Optionally the value may be compressed.
+   *
+   * @param name The name of the field
+   * @param value The binary value
+   * @param store How the value should be stored (compressed or not)
+   */
+  public Field(String name, byte[] value, Store store) {
     if (name == null)
      throw new IllegalArgumentException("name cannot be null");
    if (value == null)
      throw new IllegalArgumentException("value cannot be null");
+    if (store == Store.NO)
+      throw new IllegalArgumentException("binary values can't be unstored");
+    if (store == Store.COMPRESS)
+      this.isCompressed = true;
 
    this.name = name.intern();
-    this.binaryValue = value;
+    // store the byte[] value directly in the field's data object
+    this.fieldsData = value;
 
    this.isBinary = true;
    this.isStored = true;
@@ -377,7 +403,7 @@ public final class Field implements java.io.Serializable {
    * @deprecated use {@link #Field(String, String, Field.Store, Field.Index, Field.TermVector)} instead
    */
   public Field(String name, String string,
-               boolean store, boolean index, boolean token, boolean storeTermVector) {
+              boolean store, boolean index, boolean token, boolean storeTermVector) {
     if (name == null)
       throw new NullPointerException("name cannot be null");
     if (string == null)
@@ -385,8 +411,8 @@ public final class Field implements java.io.Serializable {
     if (!index && storeTermVector)
       throw new IllegalArgumentException("cannot store a term vector for fields that are not indexed");
 
-    this.name = name.intern();        // field names are interned
-    this.stringValue = string;
+    this.name = name.intern();    // field names are interned
+    this.fieldsData = string;
     this.isStored = store;
     this.isIndexed = index;
     this.isTokenized = token;
@@ -406,16 +432,19 @@ public final class Field implements java.io.Serializable {
   /** True iff the value of the field is to be stored in the index for return
     with search hits.
     It is an error for this to be true if a field is Reader-valued. */
-  public final boolean isStored()       { return isStored; }
+  public final boolean isStored()   { return isStored; }
 
   /** True iff the value of the field is to be indexed, so that it may be
     searched on. */
-  public final boolean isIndexed()      { return isIndexed; }
+  public final boolean isIndexed()  { return isIndexed; }
 
   /** True iff the value of the field should be tokenized as text prior to
     indexing. Un-tokenized fields are indexed as a single word and may not be
     Reader-valued. */
-  public final boolean isTokenized()    { return isTokenized; }
+  public final boolean isTokenized()    { return isTokenized; }
+
+  /** True if the value of the field is stored and compressed within the index */
+  public final boolean isCompressed()   { return isCompressed; }
 
   /** True iff the term or terms used to index this field are stored as a term
    * vector, available from {@link IndexReader#getTermFreqVector(int,String)}.
@@ -456,14 +485,20 @@ public final class Field implements java.io.Serializable {
       result.append("binary");
     }
+
+    if (isCompressed) {
+      if (result.length() > 0)
+        result.append(",");
+      result.append("compressed");
+    }
 
     result.append('<');
     result.append(name);
     result.append(':');
-    if (readerValue != null) {
-      result.append(readerValue.toString());
-    } else {
-      result.append(stringValue);
+
+    if (fieldsData != null) {
+      result.append(fieldsData);
     }
+
     result.append('>');
 
     return result.toString();
   }
diff --git a/src/java/org/apache/lucene/index/FieldsReader.java b/src/java/org/apache/lucene/index/FieldsReader.java
index c8f37873b84..3d8a8d7f166 100644
--- a/src/java/org/apache/lucene/index/FieldsReader.java
+++ b/src/java/org/apache/lucene/index/FieldsReader.java
@@ -16,7 +16,10 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.util.zip.DataFormatException;
+import java.util.zip.Inflater;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -66,29 +69,77 @@ final class FieldsReader {
       FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
 
       byte bits = fieldsStream.readByte();
-      
-      if ((bits & 2) != 0) {
+
+      boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
+      boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
+
+      if ((bits & FieldsWriter.FIELD_IS_BINARY) != 0) {
         final byte[] b = new byte[fieldsStream.readVInt()];
         fieldsStream.readBytes(b, 0, b.length);
-        doc.add(new Field(fi.name, b));
+        if (compressed)
+          doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS));
+        else
+          doc.add(new Field(fi.name, b, Field.Store.YES));
       } else {
         Field.Index index;
-        boolean tokenize = (bits & 1) != 0;
+        Field.Store store = Field.Store.YES;
+
         if (fi.isIndexed && tokenize)
           index = Field.Index.TOKENIZED;
         else if (fi.isIndexed && !tokenize)
           index = Field.Index.UN_TOKENIZED;
         else
           index = Field.Index.NO;
-        doc.add(new Field(fi.name,          // name
-                fieldsStream.readString(),  // read value
-                Field.Store.YES, index,
-                fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
+
+        if (compressed) {
+          store = Field.Store.COMPRESS;
+          final byte[] b = new byte[fieldsStream.readVInt()];
+          fieldsStream.readBytes(b, 0, b.length);
+          doc.add(new Field(fi.name,        // field name
+                  new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
+                  store,
+                  index,
+                  fi.storeTermVector ?
+                  Field.TermVector.YES : Field.TermVector.NO));
+        }
+        else
+          doc.add(new Field(fi.name,        // name
+                  fieldsStream.readString(), // read value
+                  store,
+                  index,
+                  fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
       }
     }
     return doc;
   }
+
+  private final byte[] uncompress(final byte[] input)
+    throws IOException
+  {
+
+    Inflater decompressor = new Inflater();
+    decompressor.setInput(input);
+
+    // Create an expandable byte array to hold the decompressed data
+    ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
+
+    // Decompress the data
+    byte[] buf = new byte[1024];
+    while (!decompressor.finished()) {
+      try {
+        int count = decompressor.inflate(buf);
+        bos.write(buf, 0, count);
+      }
+      catch (DataFormatException e) {
+        // this will happen if the field is not compressed
+        throw new IOException("field data is in the wrong format: " + e.toString());
+      }
+    }
+
+    decompressor.end();
+
+    // Get the decompressed data
+    return bos.toByteArray();
+  }
 }
diff --git a/src/java/org/apache/lucene/index/FieldsWriter.java b/src/java/org/apache/lucene/index/FieldsWriter.java
index 4e99e231619..77f6e419f98 100644
--- a/src/java/org/apache/lucene/index/FieldsWriter.java
+++ b/src/java/org/apache/lucene/index/FieldsWriter.java
@@ -16,8 +16,10 @@ package org.apache.lucene.index;
  * the License.
  */
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.util.Enumeration;
+import java.util.zip.Deflater;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -26,6 +28,10 @@ import org.apache.lucene.store.IndexOutput;
 
 final class FieldsWriter
 {
+  static final short FIELD_IS_TOKENIZED = 1;
+  static final short FIELD_IS_BINARY = 2;
+  static final short FIELD_IS_COMPRESSED = 4;
+
   private FieldInfos fieldInfos;
 
   private IndexOutput fieldsStream;
@@ -63,21 +69,72 @@ final class FieldsWriter
         byte bits = 0;
         if (field.isTokenized())
-          bits |= 1;
+          bits |= FieldsWriter.FIELD_IS_TOKENIZED;
         if (field.isBinary())
-          bits |= 2;
+          bits |= FieldsWriter.FIELD_IS_BINARY;
+        if (field.isCompressed())
+          bits |= FieldsWriter.FIELD_IS_COMPRESSED;
+
         fieldsStream.writeByte(bits);
-        
-        if (field.isBinary()) {
+
+        if (field.isCompressed()) {
+          // compression is enabled for the current field
+          byte[] data = null;
+          // check if it is a binary field
+          if (field.isBinary()) {
+            data = compress(field.binaryValue());
+          }
+          else {
+            data = compress(field.stringValue().getBytes("UTF-8"));
+          }
+          final int len = data.length;
+          fieldsStream.writeVInt(len);
+          fieldsStream.writeBytes(data, len);
+        }
+        else {
+          // compression is disabled for the current field
+          if (field.isBinary()) {
            byte[] data = field.binaryValue();
            final int len = data.length;
            fieldsStream.writeVInt(len);
            fieldsStream.writeBytes(data, len);
-        } else {
+          }
+          else {
            fieldsStream.writeString(field.stringValue());
+          }
         }
       }
     }
   }
+  private final byte[] compress (byte[] input) {
+
+    // Create the compressor with highest level of compression
+    Deflater compressor = new Deflater();
+    compressor.setLevel(Deflater.BEST_COMPRESSION);
+
+    // Give the compressor the data to compress
+    compressor.setInput(input);
+    compressor.finish();
+
+    /*
+     * Create an expandable byte array to hold the compressed data.
+     * You cannot use an array that's the same size as the original because
+     * there is no guarantee that the compressed data will be smaller than
+     * the uncompressed data.
+ */ + ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length); + + // Compress the data + byte[] buf = new byte[1024]; + while (!compressor.finished()) { + int count = compressor.deflate(buf); + bos.write(buf, 0, count); + } + + compressor.end(); + + // Get the compressed data + return bos.toByteArray(); + } } diff --git a/src/test/org/apache/lucene/document/TestBinaryDocument.java b/src/test/org/apache/lucene/document/TestBinaryDocument.java new file mode 100644 index 00000000000..02358f6503c --- /dev/null +++ b/src/test/org/apache/lucene/document/TestBinaryDocument.java @@ -0,0 +1,101 @@ +package org.apache.lucene.document; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.store.RAMDirectory; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Tests {@link Document} class. + * + * @author Bernhard Messer + * @version $Id$ + */ +public class TestBinaryDocument extends TestCase +{ + + String binaryValStored = "this text will be stored as a byte array in the index"; + String binaryValCompressed = "this text will be also stored and compressed as a byte array in the index"; + + public void testBinaryFieldInIndex() + throws Exception + { + Field binaryFldStored = new Field("binaryStored", binaryValStored.getBytes(), Field.Store.YES); + Field binaryFldCompressed = new Field("binaryCompressed", binaryValCompressed.getBytes(), Field.Store.COMPRESS); + Field stringFldStored = new Field("stringStored", binaryValStored, Field.Store.YES, Field.Index.NO, Field.TermVector.NO); + Field stringFldCompressed = new Field("stringCompressed", binaryValCompressed, Field.Store.COMPRESS, Field.Index.NO, Field.TermVector.NO); + + try { + // binary fields with store off are not allowed + new Field("fail", binaryValCompressed.getBytes(), Field.Store.NO); + fail(); + } + catch (IllegalArgumentException iae) { + ; + } + + Document doc = new Document(); + + doc.add(binaryFldStored); + doc.add(binaryFldCompressed); + + doc.add(stringFldStored); + doc.add(stringFldCompressed); + + /** test for field count */ + assertEquals(4, doc.fields.size()); + + /** add the doc to a ram index */ + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true); + writer.addDocument(doc); + writer.close(); + + /** open a reader and fetch the document */ + IndexReader reader = IndexReader.open(dir); + Document docFromReader = reader.document(0); + assertTrue(docFromReader != null); + + /** fetch the binary stored field and compare it's content with the original one */ + String binaryFldStoredTest = new String(docFromReader.getBinaryValue("binaryStored")); + assertTrue(binaryFldStoredTest.equals(binaryValStored)); + + /** fetch the binary compressed field and compare it's content with the original one */ + String binaryFldCompressedTest 
+      = new String(docFromReader.getBinaryValue("binaryCompressed"));
+    assertTrue(binaryFldCompressedTest.equals(binaryValCompressed));
+
+    /** fetch the string field and compare its content with the original one */
+    String stringFldStoredTest = new String(docFromReader.get("stringStored"));
+    assertTrue(stringFldStoredTest.equals(binaryValStored));
+
+    /** fetch the compressed string field and compare its content with the original one */
+    String stringFldCompressedTest = new String(docFromReader.get("stringCompressed"));
+    assertTrue(stringFldCompressedTest.equals(binaryValCompressed));
+
+    /** delete the document from the index */
+    reader.delete(0);
+    assertEquals(0, reader.numDocs());
+
+    reader.close();
+
+  }
+
+}
diff --git a/src/test/org/apache/lucene/document/TestDocument.java b/src/test/org/apache/lucene/document/TestDocument.java
index ff0c1aaa8a5..182056552d0 100644
--- a/src/test/org/apache/lucene/document/TestDocument.java
+++ b/src/test/org/apache/lucene/document/TestDocument.java
@@ -47,8 +47,8 @@ public class TestDocument extends TestCase {
     Document doc = new Document();
     Field stringFld = new Field("string", binaryVal, Field.Store.YES, Field.Index.NO);
-    Field binaryFld = new Field("binary", binaryVal.getBytes());
-    Field binaryFld2 = new Field("binary", binaryVal2.getBytes());
+    Field binaryFld = new Field("binary", binaryVal.getBytes(), Field.Store.YES);
+    Field binaryFld2 = new Field("binary", binaryVal2.getBytes(), Field.Store.YES);
     doc.add(stringFld);
     doc.add(binaryFld);
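
Usage note (illustrative, not part of the patch): the sketch below shows how the new Field.Store.COMPRESS option could be exercised end to end. It assumes the 1.4-era API already used in TestBinaryDocument above (RAMDirectory, StandardAnalyzer, IndexReader.open, Document.getBinaryValue); the class name CompressedFieldExample and the field names "body" and "blob" are invented for illustration.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class CompressedFieldExample {
  public static void main(String[] args) throws Exception {
    String body = "a long article body that is worth compressing ...";
    byte[] blob = body.getBytes("UTF-8");

    Document doc = new Document();
    // Indexed text field whose stored value is deflated on disk by FieldsWriter.
    doc.add(new Field("body", body, Field.Store.COMPRESS, Field.Index.TOKENIZED));
    // Stored-only binary field, also compressed (Store.NO is rejected for byte[] values).
    doc.add(new Field("blob", blob, Field.Store.COMPRESS));

    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
    writer.addDocument(doc);
    writer.close();

    // FieldsReader inflates compressed values transparently, so callers get the
    // original string and bytes back without any extra work.
    IndexReader reader = IndexReader.open(dir);
    Document stored = reader.document(0);
    System.out.println(stored.get("body"));                   // original text
    System.out.println(stored.getBinaryValue("blob").length); // original byte count
    reader.close();
  }
}

The on-disk cost is one flag bit per stored field: FieldsWriter sets FIELD_IS_COMPRESSED and writes the Deflater output (BEST_COMPRESSION), and FieldsReader checks the same bit and runs the Inflater loop before rebuilding the Field, so compression stays invisible to callers of Document.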