mirror of https://github.com/apache/lucene.git
Compressable field patch (SOLR-45)
git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@440837 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2a7b13bd8f
commit
d34d6aa515
|
@ -42,6 +42,10 @@ New Features
|
|||
flags (Greg Ludington via yonik, SOLR-39)
|
||||
22. A HyphenatedWordsFilter, a text analysis filter used during indexing to rejoin
|
||||
words that were hyphenated and split by a newline. (Boris Vitez via yonik, SOLR-41)
|
||||
23. Added a CompressableField base class which allows fields of derived types to
|
||||
be compressed using the compress=true setting. The field type also gains the
|
||||
ability to specify a size threshold at which field data is compressed.
|
||||
(klaas, SOLR-45)
|
||||
|
||||
Changes in runtime behavior
|
||||
1. classes reorganized into different packages, package names changed to Apache
|
||||
|
|
|
@ -15,7 +15,11 @@
|
|||
attribute and any other attributes determine the real
|
||||
behavior of the fieldtype. -->
|
||||
|
||||
<!-- The StringField type is not analyzed, but indexed/stored verbatim -->
|
||||
<!-- The StrField type is not analyzed, but indexed/stored verbatim.
|
||||
- StrField and TextField support an optional compressThreshold which
|
||||
limits compression (if enabled in the derived fields) to values which
|
||||
exceed a certain size (in characters).
|
||||
-->
|
||||
<fieldtype name="string" class="solr.StrField" sortMissingLast="true"/>
|
||||
|
||||
<!-- boolean type: "true" or "false" -->
|
||||
|
@ -156,9 +160,13 @@
|
|||
type: mandatory - the name of a previously defined type from the <types> section
|
||||
indexed: true if this field should be indexed (searchable)
|
||||
stored: true if this field should be retrievable
|
||||
compressed: [false] if this field should be stored using gzip compression
|
||||
(this will only apply if the field type is compressable; among
|
||||
the standard field types, only TextField and StrField are)
|
||||
multiValued: true if this field may contain multiple values per document
|
||||
omitNorms: (expert) set to true to omit the norms associated with this field
|
||||
(this disables length normalization and index-time boosting for the field)
|
||||
|
||||
-->
|
||||
|
||||
<field name="id" type="string" indexed="true" stored="true"/>
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
/**
|
||||
* Copyright 2006 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.schema;
|
||||
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
import org.apache.solr.request.*;
|
||||
|
||||
import java.util.Map;
|
||||
import java.io.IOException;
|
||||
|
||||
/** <code>CompressableField</code> is an abstract field type which enables a
|
||||
* field to be compressed (by specifying <code>compressed="true"</code> at the
|
||||
* field definition level) and provides optional support for specifying a
|
||||
* threshold at which compression is enabled.
|
||||
*
|
||||
* Optional settings:
|
||||
* <ul>
|
||||
* <li><code>compressThreshold</code>: length, in characters, at which point the
|
||||
* field contents should be compressed [default: 0]</li>
|
||||
* </ul></p>
|
||||
*
|
||||
* TODO: Enable compression level specification (not yet in lucene)
|
||||
*
|
||||
* @author klaas
|
||||
* @version $Id$
|
||||
*/
|
||||
public abstract class CompressableField extends FieldType {
|
||||
/* if field size (in characters) is greater than this threshold, the field
|
||||
will be stored compressed */
|
||||
public static int DEFAULT_COMPRESS_THRESHOLD = 0;
|
||||
|
||||
int compressThreshold;
|
||||
|
||||
private static String CT = "compressThreshold";
|
||||
|
||||
protected void init(IndexSchema schema, Map<String,String> args) {
|
||||
SolrParams p = new MapSolrParams(args);
|
||||
compressThreshold = p.getInt(CT, DEFAULT_COMPRESS_THRESHOLD);
|
||||
args.remove(CT);
|
||||
super.init(schema, args);
|
||||
}
|
||||
|
||||
/* Helpers for field construction */
|
||||
protected Field.Store getFieldStore(SchemaField field,
|
||||
String internalVal) {
|
||||
/* compress field if length exceeds threshold */
|
||||
if(field.isCompressed()) {
|
||||
return internalVal.length() >= compressThreshold ?
|
||||
Field.Store.COMPRESS : Field.Store.YES;
|
||||
} else
|
||||
return super.getFieldStore(field, internalVal);
|
||||
}
|
||||
}
|
|
@ -173,6 +173,18 @@ public abstract class FieldType extends FieldProperties {
|
|||
}
|
||||
if (val==null) return null;
|
||||
|
||||
Field f = new Field(field.getName(),
|
||||
val,
|
||||
getFieldStore(field, val),
|
||||
getFieldIndex(field, val),
|
||||
getFieldTermVec(field, val));
|
||||
f.setOmitNorms(field.omitNorms());
|
||||
f.setBoost(boost);
|
||||
return f;
|
||||
}
|
||||
/* Helpers for field construction */
|
||||
protected Field.TermVector getFieldTermVec(SchemaField field,
|
||||
String internalVal) {
|
||||
Field.TermVector ftv = Field.TermVector.NO;
|
||||
if (field.storeTermPositions() && field.storeTermOffsets())
|
||||
ftv = Field.TermVector.WITH_POSITIONS_OFFSETS;
|
||||
|
@ -182,17 +194,17 @@ public abstract class FieldType extends FieldProperties {
|
|||
ftv = Field.TermVector.WITH_OFFSETS;
|
||||
else if (field.storeTermVector())
|
||||
ftv = Field.TermVector.YES;
|
||||
|
||||
Field f = new Field(field.getName(),val,
|
||||
field.stored() ? Field.Store.YES : Field.Store.NO ,
|
||||
field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED :
|
||||
Field.Index.UN_TOKENIZED) : Field.Index.NO,
|
||||
ftv);
|
||||
f.setOmitNorms(field.omitNorms());
|
||||
f.setBoost(boost);
|
||||
return f;
|
||||
return ftv;
|
||||
}
|
||||
protected Field.Store getFieldStore(SchemaField field,
|
||||
String internalVal) {
|
||||
return field.stored() ? Field.Store.YES : Field.Store.NO;
|
||||
}
|
||||
protected Field.Index getFieldIndex(SchemaField field,
|
||||
String internalVal) {
|
||||
return field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED :
|
||||
Field.Index.UN_TOKENIZED) : Field.Index.NO;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Convert an external value (from XML update command or from query string)
|
||||
|
|
|
@ -76,11 +76,11 @@ public final class SchemaField extends FieldProperties {
|
|||
public boolean multiValued() { return (properties & MULTIVALUED)!=0; }
|
||||
public boolean sortMissingFirst() { return (properties & SORT_MISSING_FIRST)!=0; }
|
||||
public boolean sortMissingLast() { return (properties & SORT_MISSING_LAST)!=0; }
|
||||
public boolean isCompressed() { return (properties & COMPRESSED)!=0; }
|
||||
|
||||
// things that should be determined by field type, not set as options
|
||||
boolean isTokenized() { return (properties & TOKENIZED)!=0; }
|
||||
boolean isBinary() { return (properties & BINARY)!=0; }
|
||||
boolean isCompressed() { return (properties & COMPRESSED)!=0; }
|
||||
|
||||
public Field createField(String val, float boost) {
|
||||
return type.createField(this,val,boost);
|
||||
|
|
|
@ -28,8 +28,9 @@ import java.io.IOException;
|
|||
* @author yonik
|
||||
* @version $Id$
|
||||
*/
|
||||
public class StrField extends FieldType {
|
||||
public class StrField extends CompressableField {
|
||||
protected void init(IndexSchema schema, Map<String,String> args) {
|
||||
super.init(schema, args);
|
||||
}
|
||||
|
||||
public SortField getSortField(SchemaField field,boolean reverse) {
|
||||
|
|
|
@ -30,9 +30,10 @@ import java.io.IOException;
|
|||
* @author yonik
|
||||
* @version $Id$
|
||||
*/
|
||||
public class TextField extends FieldType {
|
||||
public class TextField extends CompressableField {
|
||||
protected void init(IndexSchema schema, Map<String,String> args) {
|
||||
properties |= TOKENIZED;
|
||||
super.init(schema, args);
|
||||
}
|
||||
|
||||
public SortField getSortField(SchemaField field, boolean reverse) {
|
||||
|
|
|
@ -319,6 +319,36 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
|
|||
|
||||
}
|
||||
|
||||
|
||||
private String mkstr(int len) {
|
||||
StringBuilder sb = new StringBuilder(len);
|
||||
for (int i = 0; i < len; i++) {
|
||||
sb.append((char)(65 + i%26));
|
||||
}
|
||||
return new String(sb);
|
||||
}
|
||||
public void testCompressableFieldType() {
|
||||
|
||||
IndexSchema ischema = new IndexSchema(getSchemaFile());
|
||||
SchemaField f; // Solr field type
|
||||
Field luf; // Lucene field
|
||||
|
||||
f = ischema.getField("test_hlt");
|
||||
luf = f.createField("test", 0f);
|
||||
assertFalse(luf.isCompressed());
|
||||
assertTrue(luf.isStored());
|
||||
|
||||
f = ischema.getField("test_hlt");
|
||||
luf = f.createField(mkstr(345), 0f);
|
||||
assertTrue(luf.isCompressed());
|
||||
assertTrue(luf.isStored());
|
||||
|
||||
f = ischema.getField("test_hlt_off");
|
||||
luf = f.createField(mkstr(400), 0f);
|
||||
assertFalse(luf.isCompressed());
|
||||
assertTrue(luf.isStored());
|
||||
|
||||
}
|
||||
|
||||
|
||||
// /** this doesn't work, but if it did, this is how we'd test it. */
|
||||
|
|
|
@ -62,6 +62,9 @@
|
|||
</fieldtype>
|
||||
|
||||
|
||||
<!-- HighlitText optimizes storage for (long) columns which will be highlit -->
|
||||
<fieldtype name="highlittext" class="solr.TextField" compressThreshold="345" />
|
||||
|
||||
<fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
|
||||
<fieldtype name="string" class="solr.StrField" sortMissingLast="true"/>
|
||||
|
||||
|
@ -300,6 +303,10 @@
|
|||
<field name="test_posofftv" type="text" termVectors="true"
|
||||
termPositions="true" termOffsets="true"/>
|
||||
|
||||
<!-- test highlit field settings -->
|
||||
<field name="test_hlt" type="highlittext" indexed="true" compressed="true"/>
|
||||
<field name="test_hlt_off" type="highlittext" indexed="true" compressed="false"/>
|
||||
|
||||
<!-- fields to test individual tokenizers and tokenfilters -->
|
||||
<field name="teststop" type="teststop" indexed="true" stored="true"/>
|
||||
<field name="lowertok" type="lowertok" indexed="true" stored="true"/>
|
||||
|
|
Loading…
Reference in New Issue