From d34d6aa515b359c4fee22a29ba50db890000c8e8 Mon Sep 17 00:00:00 2001
From: Mike Klaas
Date: Wed, 6 Sep 2006 19:11:29 +0000
Subject: [PATCH] Compressable field patch (SOLR-45)
git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@440837 13f79535-47bb-0310-9956-ffa450edef68
---
CHANGES.txt | 4 ++
example/solr/conf/schema.xml | 10 ++-
.../apache/solr/schema/CompressableField.java | 68 +++++++++++++++++++
.../org/apache/solr/schema/FieldType.java | 32 ++++++---
.../org/apache/solr/schema/SchemaField.java | 2 +-
src/java/org/apache/solr/schema/StrField.java | 3 +-
.../org/apache/solr/schema/TextField.java | 3 +-
.../apache/solr/BasicFunctionalityTest.java | 30 ++++++++
src/test/test-files/solr/conf/schema.xml | 7 ++
9 files changed, 145 insertions(+), 14 deletions(-)
create mode 100644 src/java/org/apache/solr/schema/CompressableField.java
diff --git a/CHANGES.txt b/CHANGES.txt
index a4de816154c..f28aa04895b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -42,6 +42,10 @@ New Features
flags (Greg Ludington via yonik, SOLR-39)
22. A HyphenatedWordsFilter, a text analysis filter used during indexing to rejoin
words that were hyphenated and split by a newline. (Boris Vitez via yonik, SOLR-41)
+23. Added a CompressableField base class which allows fields of derived types to
+ be compressed using the compress=true setting. The field type also gains the
+ ability to specify a size threshold at which field data is compressed.
+ (klaas, SOLR-45)
Changes in runtime behavior
1. classes reorganized into different packages, package names changed to Apache
diff --git a/example/solr/conf/schema.xml b/example/solr/conf/schema.xml
index cb25ac97892..a48e5d8109b 100755
--- a/example/solr/conf/schema.xml
+++ b/example/solr/conf/schema.xml
@@ -15,7 +15,11 @@
attribute and any other attributes determine the real
behavior of the fieldtype. -->
-
+
@@ -156,9 +160,13 @@
type: mandatory - the name of a previously defined type from the section
indexed: true if this field should be indexed (searchable)
stored: true if this field should be retrievable
+ compressed: [false] if this field should be stored using gzip compression
+ (this will only apply if the field type is compressable; among
+ the standard field types, only TextField and StrField are)
multiValued: true if this field may contain multiple values per document
omitNorms: (expert) set to true to omit the norms associated with this field
(this disables length normalization and index-time boosting for the field)
+
-->
diff --git a/src/java/org/apache/solr/schema/CompressableField.java b/src/java/org/apache/solr/schema/CompressableField.java
new file mode 100644
index 00000000000..da20f610d86
--- /dev/null
+++ b/src/java/org/apache/solr/schema/CompressableField.java
@@ -0,0 +1,68 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.schema;
+
+import org.apache.lucene.document.Field;
+
+import org.apache.solr.request.*;
+
+import java.util.Map;
+import java.io.IOException;
+
+/** CompressableField
is an abstract field type which enables a
+ * field to be compressed (by specifying compressed="true"
at the
+ * field definition level) and provides optional support for specifying a
+ * threshold at which compression is enabled.
+ *
+ * Optional settings:
+ *
+ * compressThreshold
: length, in characters, at which point the
+ * field contents should be compressed [default: 0]
+ *
+ *
+ * TODO: Enable compression level specification (not yet in lucene)
+ *
+ * @author klaas
+ * @version $Id$
+ */
+public abstract class CompressableField extends FieldType {
+ /* if field size (in characters) is greater than this threshold, the field
+ will be stored compressed */
+ public static int DEFAULT_COMPRESS_THRESHOLD = 0;
+
+ int compressThreshold;
+
+ private static String CT = "compressThreshold";
+
+ protected void init(IndexSchema schema, Map args) {
+ SolrParams p = new MapSolrParams(args);
+ compressThreshold = p.getInt(CT, DEFAULT_COMPRESS_THRESHOLD);
+ args.remove(CT);
+ super.init(schema, args);
+ }
+
+ /* Helpers for field construction */
+ protected Field.Store getFieldStore(SchemaField field,
+ String internalVal) {
+ /* compress field if length exceeds threshold */
+ if(field.isCompressed()) {
+ return internalVal.length() >= compressThreshold ?
+ Field.Store.COMPRESS : Field.Store.YES;
+ } else
+ return super.getFieldStore(field, internalVal);
+ }
+}
diff --git a/src/java/org/apache/solr/schema/FieldType.java b/src/java/org/apache/solr/schema/FieldType.java
index 106c15de870..9996803f0ad 100644
--- a/src/java/org/apache/solr/schema/FieldType.java
+++ b/src/java/org/apache/solr/schema/FieldType.java
@@ -173,6 +173,18 @@ public abstract class FieldType extends FieldProperties {
}
if (val==null) return null;
+ Field f = new Field(field.getName(),
+ val,
+ getFieldStore(field, val),
+ getFieldIndex(field, val),
+ getFieldTermVec(field, val));
+ f.setOmitNorms(field.omitNorms());
+ f.setBoost(boost);
+ return f;
+ }
+ /* Helpers for field construction */
+ protected Field.TermVector getFieldTermVec(SchemaField field,
+ String internalVal) {
Field.TermVector ftv = Field.TermVector.NO;
if (field.storeTermPositions() && field.storeTermOffsets())
ftv = Field.TermVector.WITH_POSITIONS_OFFSETS;
@@ -182,17 +194,17 @@ public abstract class FieldType extends FieldProperties {
ftv = Field.TermVector.WITH_OFFSETS;
else if (field.storeTermVector())
ftv = Field.TermVector.YES;
-
- Field f = new Field(field.getName(),val,
- field.stored() ? Field.Store.YES : Field.Store.NO ,
- field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED :
- Field.Index.UN_TOKENIZED) : Field.Index.NO,
- ftv);
- f.setOmitNorms(field.omitNorms());
- f.setBoost(boost);
- return f;
+ return ftv;
+ }
+ protected Field.Store getFieldStore(SchemaField field,
+ String internalVal) {
+ return field.stored() ? Field.Store.YES : Field.Store.NO;
+ }
+ protected Field.Index getFieldIndex(SchemaField field,
+ String internalVal) {
+ return field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED :
+ Field.Index.UN_TOKENIZED) : Field.Index.NO;
}
-
/**
* Convert an external value (from XML update command or from query string)
diff --git a/src/java/org/apache/solr/schema/SchemaField.java b/src/java/org/apache/solr/schema/SchemaField.java
index db6306a7088..f6e28226ce6 100644
--- a/src/java/org/apache/solr/schema/SchemaField.java
+++ b/src/java/org/apache/solr/schema/SchemaField.java
@@ -76,11 +76,11 @@ public final class SchemaField extends FieldProperties {
public boolean multiValued() { return (properties & MULTIVALUED)!=0; }
public boolean sortMissingFirst() { return (properties & SORT_MISSING_FIRST)!=0; }
public boolean sortMissingLast() { return (properties & SORT_MISSING_LAST)!=0; }
+ public boolean isCompressed() { return (properties & COMPRESSED)!=0; }
// things that should be determined by field type, not set as options
boolean isTokenized() { return (properties & TOKENIZED)!=0; }
boolean isBinary() { return (properties & BINARY)!=0; }
- boolean isCompressed() { return (properties & COMPRESSED)!=0; }
public Field createField(String val, float boost) {
return type.createField(this,val,boost);
diff --git a/src/java/org/apache/solr/schema/StrField.java b/src/java/org/apache/solr/schema/StrField.java
index ba30a4fecbe..fdaadbaf604 100644
--- a/src/java/org/apache/solr/schema/StrField.java
+++ b/src/java/org/apache/solr/schema/StrField.java
@@ -28,8 +28,9 @@ import java.io.IOException;
* @author yonik
* @version $Id$
*/
-public class StrField extends FieldType {
+public class StrField extends CompressableField {
protected void init(IndexSchema schema, Map args) {
+ super.init(schema, args);
}
public SortField getSortField(SchemaField field,boolean reverse) {
diff --git a/src/java/org/apache/solr/schema/TextField.java b/src/java/org/apache/solr/schema/TextField.java
index 74f3d626000..141cdb4e7dc 100644
--- a/src/java/org/apache/solr/schema/TextField.java
+++ b/src/java/org/apache/solr/schema/TextField.java
@@ -30,9 +30,10 @@ import java.io.IOException;
* @author yonik
* @version $Id$
*/
-public class TextField extends FieldType {
+public class TextField extends CompressableField {
protected void init(IndexSchema schema, Map args) {
properties |= TOKENIZED;
+ super.init(schema, args);
}
public SortField getSortField(SchemaField field, boolean reverse) {
diff --git a/src/test/org/apache/solr/BasicFunctionalityTest.java b/src/test/org/apache/solr/BasicFunctionalityTest.java
index ba87e498046..48c727b4306 100644
--- a/src/test/org/apache/solr/BasicFunctionalityTest.java
+++ b/src/test/org/apache/solr/BasicFunctionalityTest.java
@@ -319,6 +319,36 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
}
+
+ private String mkstr(int len) {
+ StringBuilder sb = new StringBuilder(len);
+ for (int i = 0; i < len; i++) {
+ sb.append((char)(65 + i%26));
+ }
+ return new String(sb);
+ }
+ public void testCompressableFieldType() {
+
+ IndexSchema ischema = new IndexSchema(getSchemaFile());
+ SchemaField f; // Solr field type
+ Field luf; // Lucene field
+
+ f = ischema.getField("test_hlt");
+ luf = f.createField("test", 0f);
+ assertFalse(luf.isCompressed());
+ assertTrue(luf.isStored());
+
+ f = ischema.getField("test_hlt");
+ luf = f.createField(mkstr(345), 0f);
+ assertTrue(luf.isCompressed());
+ assertTrue(luf.isStored());
+
+ f = ischema.getField("test_hlt_off");
+ luf = f.createField(mkstr(400), 0f);
+ assertFalse(luf.isCompressed());
+ assertTrue(luf.isStored());
+
+ }
// /** this doesn't work, but if it did, this is how we'd test it. */
diff --git a/src/test/test-files/solr/conf/schema.xml b/src/test/test-files/solr/conf/schema.xml
index 4e477aca9f8..88c43b597bb 100644
--- a/src/test/test-files/solr/conf/schema.xml
+++ b/src/test/test-files/solr/conf/schema.xml
@@ -62,6 +62,9 @@
+
+
+
@@ -300,6 +303,10 @@
+
+
+
+