Compressable field patch (SOLR-45)

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@440837 13f79535-47bb-0310-9956-ffa450edef68
2006-09-06 19:11:29 +00:00 · 2006-09-06 19:11:29 +00:00 · d34d6aa515
parent 2a7b13bd8f
commit d34d6aa515
9 changed files with 145 additions and 14 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -42,6 +42,10 @@ New Features
    flags (Greg Ludington via yonik, SOLR-39)
 22. A HyphenatedWordsFilter, a text analysis filter used during indexing to rejoin
    words that were hyphenated and split by a newline. (Boris Vitez via yonik, SOLR-41)
+23. Added a CompressableField base class which allows fields of derived types to
+    be compressed using the compress=true setting.  The field type also gains the
+    ability to specify a size threshold at which field data is compressed.
+    (klaas, SOLR-45)

 Changes in runtime behavior
 1. classes reorganized into different packages, package names changed to Apache
--- a/example/solr/conf/schema.xml
+++ b/example/solr/conf/schema.xml
@ -15,7 +15,11 @@
         attribute and any other attributes determine the real
         behavior of the fieldtype.  -->

-    <!-- The StringField type is not analyzed, but indexed/stored verbatim  -->
+    <!-- The StrField type is not analyzed, but indexed/stored verbatim.  
+       - StrField and TextField support an optional compressThreshold which
+       limits compression (if enabled in the derived fields) to values which
+       exceed a certain size (in characters).
+    -->
    <fieldtype name="string" class="solr.StrField" sortMissingLast="true"/>

    <!-- boolean type: "true" or "false" -->
@ -156,9 +160,13 @@
       type: mandatory - the name of a previously defined type from the <types> section
       indexed: true if this field should be indexed (searchable)
       stored: true if this field should be retrievable
+       compressed: [false] if this field should be stored using gzip compression
+                   (this will only apply if the field type is compressable; among
+                    the standard field types, only TextField and StrField are)
       multiValued: true if this field may contain multiple values per document
       omitNorms: (expert) set to true to omit the norms associated with this field
                  (this disables length normalization and index-time boosting for the field)
+       
   -->

   <field name="id" type="string" indexed="true" stored="true"/>
--- a/src/java/org/apache/solr/schema/CompressableField.java
+++ b/src/java/org/apache/solr/schema/CompressableField.java
@ -0,0 +1,68 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.schema;
+
+import org.apache.lucene.document.Field;
+
+import org.apache.solr.request.*;
+
+import java.util.Map;
+import java.io.IOException;
+
+/** <code>CompressableField</code> is an abstract field type which enables a
+ * field to be compressed (by specifying <code>compressed="true"</code> at the
+ * field definition level) and provides optional support for specifying a
+ * threshold at which compression is enabled.
+ *
+ * Optional settings:
+ * <ul>
+ *  <li><code>compressThreshold</code>: length, in characters, at which point the 
+ *      field contents should be compressed [default: 0]</li>
+ * </ul></p>
+ * 
+ * TODO: Enable compression level specification (not yet in lucene)
+ * 
+ * @author klaas
+ * @version $Id$
+ */
+public abstract class CompressableField extends FieldType {
+  /* if field size (in characters) is greater than this threshold, the field 
+     will be stored compressed */
+  public static int DEFAULT_COMPRESS_THRESHOLD = 0;
+
+  int compressThreshold;
+
+  private static String CT = "compressThreshold";
+
+  protected void init(IndexSchema schema, Map<String,String> args) {
+    SolrParams p = new MapSolrParams(args);
+    compressThreshold = p.getInt(CT, DEFAULT_COMPRESS_THRESHOLD);
+    args.remove(CT);
+    super.init(schema, args);    
+  }
+
+    /* Helpers for field construction */
+  protected Field.Store getFieldStore(SchemaField field,
+                                      String internalVal) {
+    /* compress field if length exceeds threshold */
+    if(field.isCompressed()) {
+      return internalVal.length() >= compressThreshold ? 
+        Field.Store.COMPRESS : Field.Store.YES;
+    } else
+      return super.getFieldStore(field, internalVal);
+  } 
+}
--- a/src/java/org/apache/solr/schema/FieldType.java
+++ b/src/java/org/apache/solr/schema/FieldType.java
@ -173,6 +173,18 @@ public abstract class FieldType extends FieldProperties {
    }
    if (val==null) return null;

+    Field f = new Field(field.getName(),
+                        val,
+                        getFieldStore(field, val),
+                        getFieldIndex(field, val),
+                        getFieldTermVec(field, val));
+    f.setOmitNorms(field.omitNorms());
+    f.setBoost(boost);
+    return f;
+  }
+  /* Helpers for field construction */
+  protected Field.TermVector getFieldTermVec(SchemaField field,
+                                             String internalVal) {
    Field.TermVector ftv = Field.TermVector.NO;
    if (field.storeTermPositions() && field.storeTermOffsets())
      ftv = Field.TermVector.WITH_POSITIONS_OFFSETS;
@ -182,17 +194,17 @@ public abstract class FieldType extends FieldProperties {
      ftv = Field.TermVector.WITH_OFFSETS;            
    else if (field.storeTermVector())
      ftv = Field.TermVector.YES;
-
-    Field f =  new Field(field.getName(),val,
-        field.stored() ? Field.Store.YES : Field.Store.NO ,
-        field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED : 
-                           Field.Index.UN_TOKENIZED) : Field.Index.NO,
-        ftv);
-    f.setOmitNorms(field.omitNorms());
-    f.setBoost(boost);
-    return f;
+    return ftv;
+  }
+  protected Field.Store getFieldStore(SchemaField field,
+                                      String internalVal) {
+    return field.stored() ? Field.Store.YES : Field.Store.NO;
+  }
+  protected Field.Index getFieldIndex(SchemaField field,
+                                      String internalVal) {
+    return field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED : 
+                              Field.Index.UN_TOKENIZED) : Field.Index.NO;
  }
-

  /**
   * Convert an external value (from XML update command or from query string)
--- a/src/java/org/apache/solr/schema/SchemaField.java
+++ b/src/java/org/apache/solr/schema/SchemaField.java
@ -76,11 +76,11 @@ public final class SchemaField extends FieldProperties {
  public boolean multiValued() { return (properties & MULTIVALUED)!=0; }
  public boolean sortMissingFirst() { return (properties & SORT_MISSING_FIRST)!=0; }
  public boolean sortMissingLast() { return (properties & SORT_MISSING_LAST)!=0; }
+  public boolean isCompressed() { return (properties & COMPRESSED)!=0; }

  // things that should be determined by field type, not set as options
  boolean isTokenized() { return (properties & TOKENIZED)!=0; }
  boolean isBinary() { return (properties & BINARY)!=0; }
-  boolean isCompressed() { return (properties & COMPRESSED)!=0; }

  public Field createField(String val, float boost) {
    return type.createField(this,val,boost);
--- a/src/java/org/apache/solr/schema/StrField.java
+++ b/src/java/org/apache/solr/schema/StrField.java
@ -28,8 +28,9 @@ import java.io.IOException;
 * @author yonik
 * @version $Id$
 */
-public class StrField extends FieldType {
+public class StrField extends CompressableField {
  protected void init(IndexSchema schema, Map<String,String> args) {
+    super.init(schema, args);    
  }

  public SortField getSortField(SchemaField field,boolean reverse) {
--- a/src/java/org/apache/solr/schema/TextField.java
+++ b/src/java/org/apache/solr/schema/TextField.java
@ -30,9 +30,10 @@ import java.io.IOException;
 * @author yonik
 * @version $Id$
 */
-public class TextField extends FieldType {
+public class TextField extends CompressableField {
  protected void init(IndexSchema schema, Map<String,String> args) {
    properties |= TOKENIZED;
+    super.init(schema, args);    
  }

  public SortField getSortField(SchemaField field, boolean reverse) {
--- a/src/test/org/apache/solr/BasicFunctionalityTest.java
+++ b/src/test/org/apache/solr/BasicFunctionalityTest.java
@ -319,6 +319,36 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {

  }
      
+
+  private String mkstr(int len) {
+    StringBuilder sb = new StringBuilder(len);
+    for (int i = 0; i < len; i++) {
+      sb.append((char)(65 + i%26));
+    }
+    return new String(sb);
+  }   
+  public void testCompressableFieldType() {
+    
+    IndexSchema ischema = new IndexSchema(getSchemaFile());
+    SchemaField f; // Solr field type
+    Field luf; // Lucene field
+
+    f = ischema.getField("test_hlt");
+    luf = f.createField("test", 0f);
+    assertFalse(luf.isCompressed());
+    assertTrue(luf.isStored());
+
+    f = ischema.getField("test_hlt");
+    luf = f.createField(mkstr(345), 0f);
+    assertTrue(luf.isCompressed());
+    assertTrue(luf.isStored());
+
+    f = ischema.getField("test_hlt_off");
+    luf = f.createField(mkstr(400), 0f);
+    assertFalse(luf.isCompressed());
+    assertTrue(luf.isStored());
+    
+  }
            

 //   /** this doesn't work, but if it did, this is how we'd test it. */
--- a/src/test/test-files/solr/conf/schema.xml
+++ b/src/test/test-files/solr/conf/schema.xml
@ -62,6 +62,9 @@
    </fieldtype>


+    <!-- HighlitText optimizes storage for (long) columns which will be highlit -->
+    <fieldtype name="highlittext" class="solr.TextField" compressThreshold="345" />
+
    <fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
    <fieldtype name="string" class="solr.StrField" sortMissingLast="true"/>

@ -300,6 +303,10 @@
   <field name="test_posofftv" type="text" termVectors="true" 
     termPositions="true" termOffsets="true"/>

+   <!-- test highlit field settings -->
+   <field name="test_hlt" type="highlittext" indexed="true" compressed="true"/>
+   <field name="test_hlt_off" type="highlittext" indexed="true" compressed="false"/>
+
   <!-- fields to test individual tokenizers and tokenfilters -->
   <field name="teststop" type="teststop" indexed="true" stored="true"/>
   <field name="lowertok" type="lowertok" indexed="true" stored="true"/>