From d34d6aa515b359c4fee22a29ba50db890000c8e8 Mon Sep 17 00:00:00 2001 From: Mike Klaas Date: Wed, 6 Sep 2006 19:11:29 +0000 Subject: [PATCH] Compressable field patch (SOLR-45) git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@440837 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 4 ++ example/solr/conf/schema.xml | 10 ++- .../apache/solr/schema/CompressableField.java | 68 +++++++++++++++++++ .../org/apache/solr/schema/FieldType.java | 32 ++++++--- .../org/apache/solr/schema/SchemaField.java | 2 +- src/java/org/apache/solr/schema/StrField.java | 3 +- .../org/apache/solr/schema/TextField.java | 3 +- .../apache/solr/BasicFunctionalityTest.java | 30 ++++++++ src/test/test-files/solr/conf/schema.xml | 7 ++ 9 files changed, 145 insertions(+), 14 deletions(-) create mode 100644 src/java/org/apache/solr/schema/CompressableField.java diff --git a/CHANGES.txt b/CHANGES.txt index a4de816154c..f28aa04895b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -42,6 +42,10 @@ New Features flags (Greg Ludington via yonik, SOLR-39) 22. A HyphenatedWordsFilter, a text analysis filter used during indexing to rejoin words that were hyphenated and split by a newline. (Boris Vitez via yonik, SOLR-41) +23. Added a CompressableField base class which allows fields of derived types to + be compressed using the compress=true setting. The field type also gains the + ability to specify a size threshold at which field data is compressed. + (klaas, SOLR-45) Changes in runtime behavior 1. classes reorganized into different packages, package names changed to Apache diff --git a/example/solr/conf/schema.xml b/example/solr/conf/schema.xml index cb25ac97892..a48e5d8109b 100755 --- a/example/solr/conf/schema.xml +++ b/example/solr/conf/schema.xml @@ -15,7 +15,11 @@ attribute and any other attributes determine the real behavior of the fieldtype. --> - + @@ -156,9 +160,13 @@ type: mandatory - the name of a previously defined type from the section indexed: true if this field should be indexed (searchable) stored: true if this field should be retrievable + compressed: [false] if this field should be stored using gzip compression + (this will only apply if the field type is compressable; among + the standard field types, only TextField and StrField are) multiValued: true if this field may contain multiple values per document omitNorms: (expert) set to true to omit the norms associated with this field (this disables length normalization and index-time boosting for the field) + --> diff --git a/src/java/org/apache/solr/schema/CompressableField.java b/src/java/org/apache/solr/schema/CompressableField.java new file mode 100644 index 00000000000..da20f610d86 --- /dev/null +++ b/src/java/org/apache/solr/schema/CompressableField.java @@ -0,0 +1,68 @@ +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.schema; + +import org.apache.lucene.document.Field; + +import org.apache.solr.request.*; + +import java.util.Map; +import java.io.IOException; + +/** CompressableField is an abstract field type which enables a + * field to be compressed (by specifying compressed="true" at the + * field definition level) and provides optional support for specifying a + * threshold at which compression is enabled. + * + * Optional settings: + *
    + *
  • compressThreshold: length, in characters, at which point the + * field contents should be compressed [default: 0]
  • + *

+ * + * TODO: Enable compression level specification (not yet in lucene) + * + * @author klaas + * @version $Id$ + */ +public abstract class CompressableField extends FieldType { + /* if field size (in characters) is greater than this threshold, the field + will be stored compressed */ + public static int DEFAULT_COMPRESS_THRESHOLD = 0; + + int compressThreshold; + + private static String CT = "compressThreshold"; + + protected void init(IndexSchema schema, Map args) { + SolrParams p = new MapSolrParams(args); + compressThreshold = p.getInt(CT, DEFAULT_COMPRESS_THRESHOLD); + args.remove(CT); + super.init(schema, args); + } + + /* Helpers for field construction */ + protected Field.Store getFieldStore(SchemaField field, + String internalVal) { + /* compress field if length exceeds threshold */ + if(field.isCompressed()) { + return internalVal.length() >= compressThreshold ? + Field.Store.COMPRESS : Field.Store.YES; + } else + return super.getFieldStore(field, internalVal); + } +} diff --git a/src/java/org/apache/solr/schema/FieldType.java b/src/java/org/apache/solr/schema/FieldType.java index 106c15de870..9996803f0ad 100644 --- a/src/java/org/apache/solr/schema/FieldType.java +++ b/src/java/org/apache/solr/schema/FieldType.java @@ -173,6 +173,18 @@ public abstract class FieldType extends FieldProperties { } if (val==null) return null; + Field f = new Field(field.getName(), + val, + getFieldStore(field, val), + getFieldIndex(field, val), + getFieldTermVec(field, val)); + f.setOmitNorms(field.omitNorms()); + f.setBoost(boost); + return f; + } + /* Helpers for field construction */ + protected Field.TermVector getFieldTermVec(SchemaField field, + String internalVal) { Field.TermVector ftv = Field.TermVector.NO; if (field.storeTermPositions() && field.storeTermOffsets()) ftv = Field.TermVector.WITH_POSITIONS_OFFSETS; @@ -182,17 +194,17 @@ public abstract class FieldType extends FieldProperties { ftv = Field.TermVector.WITH_OFFSETS; else if (field.storeTermVector()) ftv = Field.TermVector.YES; - - Field f = new Field(field.getName(),val, - field.stored() ? Field.Store.YES : Field.Store.NO , - field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED : - Field.Index.UN_TOKENIZED) : Field.Index.NO, - ftv); - f.setOmitNorms(field.omitNorms()); - f.setBoost(boost); - return f; + return ftv; + } + protected Field.Store getFieldStore(SchemaField field, + String internalVal) { + return field.stored() ? Field.Store.YES : Field.Store.NO; + } + protected Field.Index getFieldIndex(SchemaField field, + String internalVal) { + return field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED : + Field.Index.UN_TOKENIZED) : Field.Index.NO; } - /** * Convert an external value (from XML update command or from query string) diff --git a/src/java/org/apache/solr/schema/SchemaField.java b/src/java/org/apache/solr/schema/SchemaField.java index db6306a7088..f6e28226ce6 100644 --- a/src/java/org/apache/solr/schema/SchemaField.java +++ b/src/java/org/apache/solr/schema/SchemaField.java @@ -76,11 +76,11 @@ public final class SchemaField extends FieldProperties { public boolean multiValued() { return (properties & MULTIVALUED)!=0; } public boolean sortMissingFirst() { return (properties & SORT_MISSING_FIRST)!=0; } public boolean sortMissingLast() { return (properties & SORT_MISSING_LAST)!=0; } + public boolean isCompressed() { return (properties & COMPRESSED)!=0; } // things that should be determined by field type, not set as options boolean isTokenized() { return (properties & TOKENIZED)!=0; } boolean isBinary() { return (properties & BINARY)!=0; } - boolean isCompressed() { return (properties & COMPRESSED)!=0; } public Field createField(String val, float boost) { return type.createField(this,val,boost); diff --git a/src/java/org/apache/solr/schema/StrField.java b/src/java/org/apache/solr/schema/StrField.java index ba30a4fecbe..fdaadbaf604 100644 --- a/src/java/org/apache/solr/schema/StrField.java +++ b/src/java/org/apache/solr/schema/StrField.java @@ -28,8 +28,9 @@ import java.io.IOException; * @author yonik * @version $Id$ */ -public class StrField extends FieldType { +public class StrField extends CompressableField { protected void init(IndexSchema schema, Map args) { + super.init(schema, args); } public SortField getSortField(SchemaField field,boolean reverse) { diff --git a/src/java/org/apache/solr/schema/TextField.java b/src/java/org/apache/solr/schema/TextField.java index 74f3d626000..141cdb4e7dc 100644 --- a/src/java/org/apache/solr/schema/TextField.java +++ b/src/java/org/apache/solr/schema/TextField.java @@ -30,9 +30,10 @@ import java.io.IOException; * @author yonik * @version $Id$ */ -public class TextField extends FieldType { +public class TextField extends CompressableField { protected void init(IndexSchema schema, Map args) { properties |= TOKENIZED; + super.init(schema, args); } public SortField getSortField(SchemaField field, boolean reverse) { diff --git a/src/test/org/apache/solr/BasicFunctionalityTest.java b/src/test/org/apache/solr/BasicFunctionalityTest.java index ba87e498046..48c727b4306 100644 --- a/src/test/org/apache/solr/BasicFunctionalityTest.java +++ b/src/test/org/apache/solr/BasicFunctionalityTest.java @@ -319,6 +319,36 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase { } + + private String mkstr(int len) { + StringBuilder sb = new StringBuilder(len); + for (int i = 0; i < len; i++) { + sb.append((char)(65 + i%26)); + } + return new String(sb); + } + public void testCompressableFieldType() { + + IndexSchema ischema = new IndexSchema(getSchemaFile()); + SchemaField f; // Solr field type + Field luf; // Lucene field + + f = ischema.getField("test_hlt"); + luf = f.createField("test", 0f); + assertFalse(luf.isCompressed()); + assertTrue(luf.isStored()); + + f = ischema.getField("test_hlt"); + luf = f.createField(mkstr(345), 0f); + assertTrue(luf.isCompressed()); + assertTrue(luf.isStored()); + + f = ischema.getField("test_hlt_off"); + luf = f.createField(mkstr(400), 0f); + assertFalse(luf.isCompressed()); + assertTrue(luf.isStored()); + + } // /** this doesn't work, but if it did, this is how we'd test it. */ diff --git a/src/test/test-files/solr/conf/schema.xml b/src/test/test-files/solr/conf/schema.xml index 4e477aca9f8..88c43b597bb 100644 --- a/src/test/test-files/solr/conf/schema.xml +++ b/src/test/test-files/solr/conf/schema.xml @@ -62,6 +62,9 @@
+ + + @@ -300,6 +303,10 @@ + + + +