SOLR-940 -- Add support for Lucene's Trie Range Queries

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@752562 13f79535-47bb-0310-9956-ffa450edef68
Shalin Shekhar Mangar 2009-03-11 18:42:43 +00:00
parent 1826b73a27
commit 266be45c7d
9 changed files with 942 additions and 9 deletions

View File

@ -179,6 +179,11 @@ New Features
and lucene query parser (the latter existed as an undocumented
feature in 1.3) (yonik)
30. SOLR-940: Add support for Lucene's Trie Range Queries by providing new FieldTypes in schema for int, float, long,
double and date. Range searches and term queries on such fields will automatically use the corresponding trie
range filter in Lucene contrib-queries and can be dramatically faster than normal range queries.
(Uwe Schindler, shalin)
Optimizations
----------------------

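As an illustration of what this entry means in practice, here is a minimal sketch (not part of the commit) of the query shape a trie range search compiles to: a constant-score query wrapping Lucene's IntTrieRangeFilter from contrib-queries, exactly as SolrQueryParser and TrieField do further down in this change. The class name, field name and precisionStep value are example assumptions.

import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.trie.IntTrieRangeFilter;

public class TrieRangeQuerySketch {
  // Builds the equivalent of the Solr query tint:[2 TO 6] on a trie int field
  // with the default precisionStep of 8. Passing null for a bound leaves it open.
  public static Query intRange(String field, Integer min, Integer max) {
    return new ConstantScoreQuery(
        new IntTrieRangeFilter(field, field, 8, min, max, true, true));
  }
}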
View File

@ -115,6 +115,39 @@
-->
<fieldType name="date" class="solr.DateField" sortMissingLast="true" omitNorms="true"/>
<!--
Numeric field types that manipulate the value into trie encoded strings which are not
human readable in their internal form. Range searches on such fields use the fast Trie Range Queries,
which are much faster than range searches on the Sortable*Field types.
For the fast range search to work, trie fields must be indexed. Trie fields are <b>not</b> sortable
in numerical order and cannot be used in function queries. If sorting as well as fast range
search is needed, create a copy field specifically for sorting; the same workaround applies
to using trie fields in function queries.
For each number added to this field, one term is generated per precision level, as per the algorithm
described in the org.apache.lucene.search.trie package description. The number of terms a range query
may have to visit grows dramatically with higher precision steps (factor 2**precisionStep). The default
value of precisionStep is 8.
Note that with a precisionStep of 32 for int/float or 64 for long/double, only a single term is
generated per value; range searches will be no faster than on any other number field,
but sorting becomes possible.
-->
<fieldType name="tint" class="solr.TrieField" type="integer" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
<fieldType name="tfloat" class="solr.TrieField" type="float" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
<fieldType name="tlong" class="solr.TrieField" type="long" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
<fieldType name="tdouble" class="solr.TrieField" type="double" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
<fieldType name="tdouble4" class="solr.TrieField" type="double" precisionStep="4" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
<!--
This date field manipulates the value into trie encoded strings for fast range searches. It follows the
same format and semantics as the normal DateField and supports the date math syntax, except that it is
not sortable and cannot be used in function queries.
-->
<fieldType name="tdate" class="solr.TrieField" type="date" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
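To make the precisionStep trade-off described above concrete, the following sketch (an illustration, not part of this commit) counts the terms TrieUtils generates per indexed value; TrieUtils is the same Lucene contrib-queries helper used by the new tokenizers in this change, and the counts in the comments are approximate expectations.

import org.apache.lucene.search.trie.TrieUtils;

public class PrecisionStepSketch {
  public static void main(String[] args) {
    // trieCodeInt returns one encoded term per precision level, so roughly
    // 32/precisionStep terms are indexed for every int value.
    System.out.println(TrieUtils.trieCodeInt(42, 8).length);  // default step: ~4 terms
    System.out.println(TrieUtils.trieCodeInt(42, 4).length);  // smaller step: ~8 terms
    System.out.println(TrieUtils.trieCodeInt(42, 32).length); // full precision: 1 term, no range speedup
  }
}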
<!-- The "RandomSortField" is not used to store or search any
data. You can declare fields of this type in your schema
@ -343,6 +376,19 @@
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
<field name="spell" type="textSpell" indexed="true" stored="true" multiValued="true"/>
<!-- Some examples of trie fields -->
<field name="tint" type="tint" indexed="true" stored="false" />
<field name="tfloat" type="tfloat" indexed="true" stored="false" />
<field name="tlong" type="tlong" indexed="true" stored="false" />
<field name="tdouble" type="tdouble" indexed="true" stored="false" />
<!-- A double with a custom precisionStep -->
<field name="tdouble4" type="tdouble4" indexed="true" stored="false" />
<!-- An example for the trie date field -->
<field name="tdate" type="tdate" indexed="true" stored="true" />
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have

View File

@ -0,0 +1,103 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.search.trie.TrieUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.DateField;
import static org.apache.solr.schema.TrieField.TrieTypes;
import java.io.IOException;
import java.io.Reader;
/**
* Index time tokenizer for trie fields. It uses methods in TrieUtils to create multiple trie encoded strings per number.
* Each string created by this tokenizer for a given number differs from the previous one by the given precisionStep.
* <p/>
* Refer to {@linkplain org.apache.lucene.search.trie package description} for more details.
*
* @version $Id$
* @see org.apache.lucene.search.trie.TrieUtils
* @see org.apache.solr.schema.TrieField
* @since solr 1.4
*/
public class TrieIndexTokenizerFactory extends BaseTokenizerFactory {
private final int precisionStep;
private final TrieTypes type;
public TrieIndexTokenizerFactory(TrieTypes type, int precisionStep) {
this.type = type;
this.precisionStep = precisionStep;
}
public TokenStream create(Reader input) {
try {
return new TrieIndexTokenizer(input, type, precisionStep);
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create TrieIndexTokenizer", e);
}
}
/**
* @version $Id$
* @since solr 1.4
*/
public static class TrieIndexTokenizer extends Tokenizer {
private final String[] trieVals;
private int pos = 0;
protected static final DateField dateField = new DateField();
public TrieIndexTokenizer(Reader reader, TrieTypes type, int precisionStep) throws IOException {
super(reader);
StringBuilder builder = new StringBuilder();
char[] buf = new char[8];
int len;
while ((len = reader.read(buf)) != -1)
builder.append(buf, 0, len);
switch (type) {
case INTEGER:
this.trieVals = TrieUtils.trieCodeInt(Integer.parseInt(builder.toString()), precisionStep);
break;
case FLOAT:
this.trieVals = TrieUtils.trieCodeInt(TrieUtils.floatToSortableInt(Float.parseFloat(builder.toString())), precisionStep);
break;
case LONG:
this.trieVals = TrieUtils.trieCodeLong(Long.parseLong(builder.toString()), precisionStep);
break;
case DOUBLE:
this.trieVals = TrieUtils.trieCodeLong(TrieUtils.doubleToSortableLong(Double.parseDouble(builder.toString())), precisionStep);
break;
case DATE:
this.trieVals = TrieUtils.trieCodeLong(dateField.parseMath(null, builder.toString()).getTime(), precisionStep);
break;
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field");
}
}
public Token next(Token token) {
if (pos >= trieVals.length) return null;
token.reinit(trieVals[pos++], 0, 0);
token.setPositionIncrement(0);
return token;
}
}
}
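A short usage sketch (not part of the commit; it assumes the Lucene 2.9-era Token API used above) of how this index-time factory turns a single number into several stacked tokens:

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.TrieIndexTokenizerFactory;
import org.apache.solr.schema.TrieField.TrieTypes;

public class TrieIndexTokenizerSketch {
  public static void main(String[] args) throws Exception {
    TokenStream stream =
        new TrieIndexTokenizerFactory(TrieTypes.INTEGER, 8).create(new StringReader("42"));
    Token token = new Token();
    // Each token is an opaque trie-encoded term; all of them sit at the same
    // position because the tokenizer sets a position increment of 0.
    while ((token = stream.next(token)) != null) {
      System.out.println(token.term());
    }
  }
}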

View File

@ -0,0 +1,103 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.search.trie.TrieUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.DateField;
import org.apache.solr.schema.TrieField;
import java.io.IOException;
import java.io.Reader;
/**
* Query time tokenizer for trie fields. It uses methods in TrieUtils to create a prefix coded representation of the
* given number which is used for term queries.
* <p/>
* Note that trie date values are parsed with DateField semantics (including date math) before being prefix coded.
*
* @version $Id$
* @see org.apache.lucene.search.trie.TrieUtils
* @see org.apache.solr.schema.TrieField
* @since solr 1.4
*/
public class TrieQueryTokenizerFactory extends BaseTokenizerFactory {
private final TrieField.TrieTypes type;
public TrieQueryTokenizerFactory(TrieField.TrieTypes type) {
this.type = type;
}
public TokenStream create(Reader reader) {
try {
return new TrieQueryTokenizer(reader, type);
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create TrieQueryTokenizer", e);
}
}
public static class TrieQueryTokenizer extends Tokenizer {
private static final DateField dateField = new DateField();
private final String number;
private boolean used = false;
private TrieField.TrieTypes type;
public TrieQueryTokenizer(Reader reader, TrieField.TrieTypes type) throws IOException {
super(reader);
this.type = type;
StringBuilder builder = new StringBuilder();
char[] buf = new char[8];
int len;
while ((len = reader.read(buf)) != -1)
builder.append(buf, 0, len);
number = builder.toString();
}
@Override
public Token next(Token token) throws IOException {
if (used) return null;
String value = number;
switch (type) {
case INTEGER:
value = TrieUtils.intToPrefixCoded(Integer.parseInt(number));
break;
case FLOAT:
value = TrieUtils.intToPrefixCoded(TrieUtils.floatToSortableInt(Float.parseFloat(number)));
break;
case LONG:
value = TrieUtils.longToPrefixCoded(Long.parseLong(number));
break;
case DOUBLE:
value = TrieUtils.longToPrefixCoded(TrieUtils.doubleToSortableLong(Double.parseDouble(number)));
break;
case DATE:
value = TrieUtils.longToPrefixCoded(dateField.parseMath(null, number).getTime());
break;
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field");
}
token.reinit(value, 0, 0);
token.setPositionIncrement(0);
used = true;
return token;
}
}
}
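By contrast with the index-time factory, the query-time factory emits exactly one full-precision token per value. A small sketch (again an illustration, not part of the commit, reusing the same Token API assumption):

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.TrieQueryTokenizerFactory;
import org.apache.solr.schema.TrieField;

public class TrieQueryTokenizerSketch {
  public static void main(String[] args) throws Exception {
    TokenStream stream =
        new TrieQueryTokenizerFactory(TrieField.TrieTypes.INTEGER).create(new StringReader("42"));
    Token token = stream.next(new Token());
    // One prefix-coded term is produced for the term query; a second call returns null.
    System.out.println(token.term());
    System.out.println(stream.next(new Token()));
  }
}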

View File

@ -209,6 +209,7 @@ public class BinaryResponseWriter implements BinaryQueryResponseWriter {
KNOWN_TYPES.add(SortableDoubleField.class);
KNOWN_TYPES.add(StrField.class);
KNOWN_TYPES.add(TextField.class);
KNOWN_TYPES.add(TrieField.class);
// We do not add UUIDField because UUID object is not a supported type in JavaBinCodec
// and if we write UUIDField.toObject, we wouldn't know how to handle it in the client side
}

View File

@ -0,0 +1,187 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.schema;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.trie.IntTrieRangeFilter;
import org.apache.lucene.search.trie.LongTrieRangeFilter;
import org.apache.lucene.search.trie.TrieUtils;
import org.apache.solr.analysis.*;
import org.apache.solr.common.SolrException;
import org.apache.solr.request.TextResponseWriter;
import org.apache.solr.request.XMLWriter;
import java.io.IOException;
import java.util.Map;
/**
* Provides field types to support Lucene's Trie Range Queries. See {@linkplain org.apache.lucene.search.trie
* package description} for more details. It supports integer, float, long, double and date types.
* <p/>
* For each number added to this field, one term is generated per precision level, as per the algorithm described in
* the above link. The number of terms a range query may have to visit grows dramatically with higher precision steps
* (factor 2^precisionStep). For the fast range search to work, trie fields must be indexed.
* <p/>
* Trie fields are <b>not</b> sortable in numerical order and cannot be used in function queries. If sorting as well as
* fast range search is needed, create a copy field specifically for sorting; the same workaround applies to using
* trie fields in function queries.
* <p/>
* Note that with a precisionStep of 32 for int/float or 64 for long/double, only a single term is generated per value;
* range searches will be no faster than on any other number field, but sorting becomes possible.
*
* @version $Id$
* @see org.apache.lucene.search.trie.TrieUtils
* @since solr 1.4
*/
public class TrieField extends FieldType {
public static final int DEFAULT_PRECISION_STEP = 8;
protected int precisionStep = TrieField.DEFAULT_PRECISION_STEP;
protected TrieTypes type;
/**
* Used for handling date types following the same semantics as DateField
*/
private static final DateField dateField = new DateField();
@Override
protected void init(IndexSchema schema, Map<String, String> args) {
String p = args.remove("precisionStep");
if (p != null) {
precisionStep = Integer.parseInt(p);
}
String t = args.remove("type");
if (t == null) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"No type specified in schema.xml for field: " + args.get("name"));
} else {
try {
type = TrieTypes.valueOf(t.toUpperCase());
} catch (IllegalArgumentException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Invalid type specified in schema.xml for field: " + args.get("name"), e);
}
}
CharFilterFactory[] filterFactories = new CharFilterFactory[0];
TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
analyzer = new TokenizerChain(filterFactories, new TrieIndexTokenizerFactory(type, precisionStep), tokenFilterFactories);
queryAnalyzer = new TokenizerChain(filterFactories, new TrieQueryTokenizerFactory(type), tokenFilterFactories);
}
@Override
public Object toObject(Fieldable f) {
String s = f.stringValue();
switch (type) {
case INTEGER:
return Integer.parseInt(s);
case FLOAT:
return Float.parseFloat(s);
case LONG:
return Long.parseLong(s);
case DOUBLE:
return Double.parseDouble(s);
case DATE:
return dateField.toObject(f);
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + f.name());
}
}
public SortField getSortField(SchemaField field, boolean top) {
switch (type) {
case INTEGER:
case FLOAT:
return TrieUtils.getIntSortField(field.getName(), top);
case LONG:
case DOUBLE:
case DATE:
return TrieUtils.getLongSortField(field.getName(), top);
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + field.name);
}
}
public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
xmlWriter.writeVal(name, toObject(f));
}
public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
writer.writeVal(name, toObject(f));
}
@Override
public boolean isTokenized() {
return true;
}
/**
* @return the precisionStep used to index values into the field
*/
public int getPrecisionStep() {
return precisionStep;
}
/**
* @return the type of this field
*/
public TrieTypes getType() {
return type;
}
public Filter getTrieRangeFilter(String field, String min, String max, boolean minInclusive, boolean maxInclusive) {
switch (type) {
case INTEGER:
return new IntTrieRangeFilter(field, field, precisionStep,
"*".equals(min) ? null : Integer.parseInt(min),
"*".equals(max) ? null : Integer.parseInt(max),
minInclusive, maxInclusive);
case FLOAT:
return new IntTrieRangeFilter(field, field, precisionStep,
"*".equals(min) ? null : TrieUtils.floatToSortableInt(Float.parseFloat(min)),
"*".equals(max) ? null : TrieUtils.floatToSortableInt(Float.parseFloat(max)),
minInclusive, maxInclusive);
case LONG:
return new LongTrieRangeFilter(field, field, precisionStep,
"*".equals(min) ? null : Long.parseLong(min),
"*".equals(max) ? null : Long.parseLong(max),
minInclusive, maxInclusive);
case DOUBLE:
return new LongTrieRangeFilter(field, field, precisionStep,
"*".equals(min) ? null : TrieUtils.doubleToSortableLong(Double.parseDouble(min)),
"*".equals(max) ? null : TrieUtils.doubleToSortableLong(Double.parseDouble(max)),
minInclusive, maxInclusive);
case DATE:
return new LongTrieRangeFilter(field, field, precisionStep,
"*".equals(min) ? null : dateField.parseMath(null, min).getTime(),
"*".equals(max) ? null : dateField.parseMath(null, max).getTime(),
minInclusive, maxInclusive);
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field");
}
}
public enum TrieTypes {
INTEGER,
LONG,
FLOAT,
DOUBLE,
DATE
}
}
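For completeness, a sketch (not part of the commit) of how a configured TrieField instance builds the open-ended range filter that SolrQueryParser wraps in a ConstantScoreQuery; the field name "price" and the bounds are example assumptions, and "*" marks an open bound exactly as in the Solr range syntax.

import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Query;
import org.apache.solr.schema.TrieField;

public class TrieFieldRangeSketch {
  // Equivalent of the Solr filter query price:[100 TO *] on a trie field.
  public static Query openEndedRange(TrieField fieldType) {
    return new ConstantScoreQuery(
        fieldType.getTrieRangeFilter("price", "100", "*", true, true));
  }
}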

View File

@ -20,14 +20,12 @@ package org.apache.solr.search;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.ConstantScoreRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.TrieField;
// TODO: implement the analysis of simple fields with
// FieldType.toInternal() instead of going through the
@ -121,11 +119,18 @@ public class SolrQueryParser extends QueryParser {
protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive) throws ParseException {
checkNullField(field);
FieldType ft = schema.getFieldType(field);
return new ConstantScoreRangeQuery(
field,
"*".equals(part1) ? null : ft.toInternal(part1),
"*".equals(part2) ? null : ft.toInternal(part2),
inclusive, inclusive);
if (ft instanceof TrieField) {
TrieField f = (TrieField) ft;
return new ConstantScoreQuery(f.getTrieRangeFilter(field, part1, part2, inclusive, inclusive));
} else {
RangeQuery rangeQuery = new RangeQuery(
field,
"*".equals(part1) ? null : ft.toInternal(part1),
"*".equals(part2) ? null : ft.toInternal(part2),
inclusive, inclusive);
rangeQuery.setConstantScoreRewrite(true);
return rangeQuery;
}
}
protected Query getPrefixQuery(String field, String termStr) throws ParseException {

View File

@ -0,0 +1,155 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.util.AbstractSolrTestCase;
import java.sql.Date;
import java.text.SimpleDateFormat;
/**
* Tests for TrieField functionality
*
* @version $Id$
* @since solr 1.4
*/
public class TestTrie extends AbstractSolrTestCase {
public String getSchemaFile() {
return "schema-trie.xml";
}
public String getSolrConfigFile() {
return "solrconfig.xml";
}
public void testTrieIntRangeSearch() throws Exception {
for (int i = 0; i < 10; i++) {
assertU(adoc("id", String.valueOf(i), "tint", String.valueOf(i)));
}
assertU(commit());
assertQ("Range filter must match only 5 documents", req("q", "*:*", "fq", "tint:[2 TO 6]"), "//*[@numFound='5']");
for (int i = 1; i < 11; i++) {
assertU(adoc("id", String.valueOf(-i), "tint", String.valueOf(-i)));
}
assertU(commit());
assertQ("Range filter must match only 5 documents", req("q", "*:*", "fq", "tint:[-6 TO -2]"), "//*[@numFound='5']");
// Test open ended range searches
assertQ("Range filter tint:[-10 TO *] must match 20 documents", req("q", "*:*", "fq", "tint:[-10 TO *]"), "//*[@numFound='20']");
assertQ("Range filter tint:[* TO 10] must match 20 documents", req("q", "*:*", "fq", "tint:[* TO 10]"), "//*[@numFound='20']");
assertQ("Range filter tint:[* TO *] must match 20 documents", req("q", "*:*", "fq", "tint:[* TO *]"), "//*[@numFound='20']");
}
public void testTrieTermQuery() throws Exception {
for (int i = 0; i < 10; i++) {
assertU(adoc("id", String.valueOf(i),
"tint", String.valueOf(i),
"tfloat", String.valueOf(i * i * 31.11f),
"tlong", String.valueOf((long) Integer.MAX_VALUE + (long) i),
"tdouble", String.valueOf(i * 2.33d)));
}
assertU(commit());
// Use with q
assertQ("Term query on trie int field must match 1 document", req("q", "tint:2"), "//*[@numFound='1']");
assertQ("Term query on trie float field must match 1 document", req("q", "tfloat:124.44"), "//*[@numFound='1']");
assertQ("Term query on trie long field must match 1 document", req("q", "tlong:2147483648"), "//*[@numFound='1']");
assertQ("Term query on trie double field must match 1 document", req("q", "tdouble:4.66"), "//*[@numFound='1']");
// Use with fq
assertQ("Term query on trie int field must match 1 document", req("q", "*:*", "fq", "tint:2"), "//*[@numFound='1']");
assertQ("Term query on trie float field must match 1 document", req("q", "*:*", "fq", "tfloat:124.44"), "//*[@numFound='1']");
assertQ("Term query on trie long field must match 1 document", req("q", "*:*", "fq", "tlong:2147483648"), "//*[@numFound='1']");
assertQ("Term query on trie double field must match 1 document", req("q", "*:*", "fq", "tdouble:4.66"), "//*[@numFound='1']");
}
public void testTrieFloatRangeSearch() throws Exception {
for (int i = 0; i < 10; i++) {
assertU(adoc("id", String.valueOf(i), "tfloat", String.valueOf(i * i * 31.11f)));
}
assertU(commit());
SolrQueryRequest req = req("q", "*:*", "fq", "tfloat:[0 TO 2518.0]");
assertQ("Range filter must match 9 documents", req, "//*[@numFound='9']");
req = req("q", "*:*", "fq", "tfloat:[0 TO *]");
assertQ("Range filter must match 10 documents", req, "//*[@numFound='10']");
}
public void testTrieLongRangeSearch() throws Exception {
for (long i = Integer.MAX_VALUE, c = 0; i < (long) Integer.MAX_VALUE + 10l; i++) {
assertU(adoc("id", String.valueOf(c++), "tlong", String.valueOf(i)));
}
assertU(commit());
String fq = "tlong:[" + Integer.MAX_VALUE + " TO " + (5l + Integer.MAX_VALUE) + "]";
SolrQueryRequest req = req("q", "*:*", "fq", fq);
assertQ("Range filter must match 6 documents", req, "//*[@numFound='6']");
assertQ("Range filter tlong:[* to *] must match 10 documents", req("q", "*:*", "fq", "tlong:[* TO *]"), "//*[@numFound='10']");
}
public void testTrieDoubleRangeSearch() throws Exception {
for (long i = Integer.MAX_VALUE, c = 0; i < (long) Integer.MAX_VALUE + 10l; i++) {
assertU(adoc("id", String.valueOf(c++), "tdouble", String.valueOf(i * 2.33d)));
}
assertU(commit());
String fq = "tdouble:[" + Integer.MAX_VALUE * 2.33d + " TO " + (5l + Integer.MAX_VALUE) * 2.33d + "]";
assertQ("Range filter must match 6 documents", req("q", "*:*", "fq", fq), "//*[@numFound='6']");
assertQ("Range filter tdouble:[* to *] must match 10 documents", req("q", "*:*", "fq", "tdouble:[* TO *]"), "//*[@numFound='10']");
}
public void testTrieDateRangeSearch() throws Exception {
for (int i = 0; i < 10; i++) {
assertU(adoc("id", String.valueOf(i), "tdate", "1995-12-31T23:" + (i < 10 ? "0" + i : i) + ":59.999Z"));
}
assertU(commit());
SolrQueryRequest req = req("q", "*:*", "fq", "tdate:[1995-12-31T23:00:59.999Z TO 1995-12-31T23:04:59.999Z]");
assertQ("Range filter must match only 5 documents", req, "//*[@numFound='5']");
// Test open ended range searches
assertQ("Range filter tdate:[1995-12-31T23:00:59.999Z TO *] must match 10 documents", req("q", "*:*", "fq", "tdate:[1995-12-31T23:00:59.999Z TO *]"), "//*[@numFound='10']");
assertQ("Range filter tdate:[* TO 1995-12-31T23:09:59.999Z] must match 10 documents", req("q", "*:*", "fq", "tdate:[* TO 1995-12-31T23:09:59.999Z]"), "//*[@numFound='10']");
assertQ("Range filter tdate:[* TO *] must match 10 documents", req("q", "*:*", "fq", "tdate:[* TO *]"), "//*[@numFound='10']");
// Test date math syntax
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
assertU(delQ("*:*"));
long curTime = System.currentTimeMillis();
for (int i = 0; i < 10; i++) {
// index 10 days starting with today
long date = curTime + i * 86400 * 1000;
assertU(adoc("id", String.valueOf(i), "tdate", format.format(new Date(date))));
}
assertU(commit());
assertQ("Range filter must match only 10 documents", req("q", "*:*", "fq", "tdate:[* TO *]"), "//*[@numFound='10']");
req = req("q", "*:*", "fq", "tdate:[NOW/DAY TO NOW+5DAYS]");
assertQ("Range filter must match only 5 documents", req, "//*[@numFound='5']");
// Test Term Queries
assertU(adoc("id", "11", "tdate", "1995-12-31T23:59:59.999Z"));
assertU(commit());
assertQ("Term query must match only 1 document", req("q", "tdate:1995-12-31T23\\:59\\:59.999Z"), "//*[@numFound='1']");
assertQ("Term query must match only 1 document", req("q", "*:*", "fq", "tdate:1995-12-31T23\\:59\\:59.999Z"), "//*[@numFound='1']");
}
public void testTrieDoubleRangeSearch_CustomPrecisionStep() throws Exception {
for (long i = Integer.MAX_VALUE, c = 0; i < (long) Integer.MAX_VALUE + 10l; i++) {
assertU(adoc("id", String.valueOf(c++), "tdouble4", String.valueOf(i * 2.33d)));
}
assertU(commit());
String fq = "tdouble4:[" + Integer.MAX_VALUE * 2.33d + " TO " + (5l + Integer.MAX_VALUE) * 2.33d + "]";
assertQ("Range filter must match 6 documents", req("q", "*:*", "fq", fq), "//*[@numFound='6']");
}
}

View File

@ -0,0 +1,328 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
This is the Solr schema file. This file should be named "schema.xml" and
should be in the conf directory under the solr home
(i.e. ./solr/conf/schema.xml by default)
or located where the classloader for the Solr webapp can find it.
This example schema is the recommended starting point for users.
It should be kept correct and concise, usable out-of-the-box.
For more information, on how to customize this file, please see
http://wiki.apache.org/solr/SchemaXml
-->
<schema name="example" version="1.2">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.1" is Solr's version number for the schema syntax and semantics. It should
not normally be changed by applications.
1.0: multiValued attribute did not exist, all fields are multiValued by nature
1.1: multiValued attribute introduced, false by default -->
<types>
<!-- field type definitions. The "name" attribute is
just a label to be used by field definitions. The "class"
attribute and any other attributes determine the real
behavior of the fieldType.
Class names starting with "solr" refer to java classes in the
org.apache.solr.analysis package.
-->
<!-- The StrField type is not analyzed, but indexed/stored verbatim.
- StrField and TextField support an optional compressThreshold which
limits compression (if enabled in the derived fields) to values which
exceed a certain size (in characters).
-->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
<!-- The optional sortMissingLast and sortMissingFirst attributes are
currently supported on types that are sorted internally as strings.
- If sortMissingLast="true", then a sort on this field will cause documents
without the field to come after documents with the field,
regardless of the requested sort order (asc or desc).
- If sortMissingFirst="true", then a sort on this field will cause documents
without the field to come before documents with the field,
regardless of the requested sort order.
- If sortMissingLast="false" and sortMissingFirst="false" (the default),
then default lucene sorting will be used which places docs without the
field first in an ascending sort and last in a descending sort.
-->
<!-- numeric field types that store and index the text
value verbatim (and hence don't support range queries, since the
lexicographic ordering isn't equal to the numeric ordering) -->
<fieldType name="integer" class="solr.IntField" omitNorms="true"/>
<fieldType name="long" class="solr.LongField" omitNorms="true"/>
<fieldType name="float" class="solr.FloatField" omitNorms="true"/>
<fieldType name="double" class="solr.DoubleField" omitNorms="true"/>
<!-- Numeric field types that manipulate the value into
a string value that isn't human-readable in its internal form,
but with a lexicographic ordering the same as the numeric ordering,
so that range queries work correctly. -->
<fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="tint" class="solr.TrieField" type="integer" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
<fieldType name="tfloat" class="solr.TrieField" type="float" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
<fieldType name="tlong" class="solr.TrieField" type="long" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
<fieldType name="tdouble" class="solr.TrieField" type="double" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
<fieldType name="tdouble4" class="solr.TrieField" type="double" precisionStep="4" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
<fieldType name="tdate" class="solr.TrieField" type="date" omitNorms="true" positionIncrementGap="0" indexed="true" stored="false" />
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
is a more restricted form of the canonical representation of dateTime
http://www.w3.org/TR/xmlschema-2/#dateTime
The trailing "Z" designates UTC time and is mandatory.
Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
All other components are mandatory.
Expressions can also be used to denote calculations that should be
performed relative to "NOW" to determine the value, ie...
NOW/HOUR
... Round to the start of the current hour
NOW-1DAY
... Exactly 1 day prior to now
NOW/DAY+6MONTHS+3DAYS
... 6 months and 3 days in the future from the start of
the current day
Consult the DateField javadocs for more information.
-->
<fieldType name="date" class="solr.DateField" sortMissingLast="true" omitNorms="true"/>
<!-- The "RandomSortField" is not used to store or search any
data. You can declare fields of this type in your schema
to generate pseudo-random orderings of your docs for sorting
purposes. The ordering is generated based on the field name
and the version of the index. As long as the index version
remains unchanged, and the same field name is reused,
the ordering of the docs will be consistent.
If you want different pseudo-random orderings of documents,
for the same version of the index, use a dynamicField and
change the name
-->
<fieldType name="random" class="solr.RandomSortField" indexed="true" />
<!-- solr.TextField allows the specification of custom text analyzers
specified as a tokenizer and a list of token filters. Different
analyzers may be specified for indexing and querying.
The optional positionIncrementGap puts space between multiple fields of
this type on the same document, with the purpose of preventing false phrase
matching across fields.
For more info on customizing your analyzer chain, please see
http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
-->
<!-- One can also specify an existing Analyzer class that has a
default constructor via the class attribute on the analyzer element
<fieldType name="text_greek" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/>
</fieldType>
-->
<!-- A text field that only splits on whitespace for exact matching of words -->
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
<!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
Synonyms and stopwords are customized by external files, and stemming is enabled.
Duplicate tokens at the same position (which may result from Stemmed Synonyms or
WordDelim parts) are removed.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<fieldType name="textTight" class="solr.TextField" positionIncrementGap="100" >
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!-- This is an example of using the KeywordTokenizer along
with various TokenFilterFactories to produce a sortable field
that does not include some properties of the source text
-->
<fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
<analyzer>
<!-- KeywordTokenizer does no actual tokenizing, so the entire
input string is preserved as a single token
-->
<tokenizer class="solr.KeywordTokenizerFactory"/>
<!-- The LowerCase TokenFilter does what you expect, which can be
useful when you want your sorting to be case insensitive
-->
<filter class="solr.LowerCaseFilterFactory" />
<!-- The TrimFilter removes any leading or trailing whitespace -->
<filter class="solr.TrimFilterFactory" />
<!-- The PatternReplaceFilter gives you the flexibility to use
Java Regular expression to replace any sequence of characters
matching a pattern with an arbitrary replacement string,
which may include back references to portions of the original
string matched by the pattern.
See the Java Regular Expression documentation for more
information on pattern and replacement string syntax.
http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
-->
<filter class="solr.PatternReplaceFilterFactory"
pattern="([^a-z])" replacement="" replace="all"
/>
</analyzer>
</fieldType>
<!-- since fields of this type are by default not stored or indexed, any data added to
them will be ignored outright
-->
<fieldType name="ignored" stored="false" indexed="false" class="solr.StrField" />
<fieldType name="file" keyField="id" defVal="1" stored="false" indexed="false" class="solr.ExternalFileField" valType="float"/>
</types>
<fields>
<!-- Valid attributes for fields:
name: mandatory - the name for the field
type: mandatory - the name of a previously defined type from the <types> section
indexed: true if this field should be indexed (searchable or sortable)
stored: true if this field should be retrievable
compressed: [false] if this field should be stored using gzip compression
(this will only apply if the field type is compressible; among
the standard field types, only TextField and StrField are)
multiValued: true if this field may contain multiple values per document
omitNorms: (expert) set to true to omit the norms associated with
this field (this disables length normalization and index-time
boosting for the field, and saves some memory). Only full-text
fields or fields that need an index-time boost need norms.
termVectors: [false] set to true to store the term vector for a given field.
When using MoreLikeThis, fields used for similarity should be stored for
best performance.
-->
<!-- for testing, a type that does a transform to see if it's correctly done everywhere -->
<field name="id" type="sfloat" indexed="true" stored="true" required="true" />
<field name="text" type="text" indexed="true" stored="false" />
<field name="tint" type="tint" indexed="true" stored="false" />
<field name="tfloat" type="tfloat" indexed="true" stored="false" />
<field name="tlong" type="tlong" indexed="true" stored="false" />
<field name="tdouble" type="tdouble" indexed="true" stored="false" />
<field name="tdouble4" type="tdouble4" indexed="true" stored="false" />
<field name="tdate" type="tdate" indexed="true" stored="true" />
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have
a "*" only at the start or the end.
EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
Longer patterns will be matched first. If equal size patterns
both match, the first appearing in the schema will be used. -->
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_i" type="sint" indexed="true" stored="true"/>
<dynamicField name="*_l" type="slong" indexed="true" stored="true"/>
<dynamicField name="*_f" type="sfloat" indexed="true" stored="true"/>
<dynamicField name="*_d" type="sdouble" indexed="true" stored="true"/>
<dynamicField name="*_pi" type="integer" indexed="true" stored="true"/>
<dynamicField name="*_pl" type="long" indexed="true" stored="true"/>
<dynamicField name="*_pf" type="float" indexed="true" stored="true"/>
<dynamicField name="*_pd" type="double" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text" indexed="true" stored="true"/>
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
<dynamicField name="*_ws" type="text_ws" indexed="true" stored="true"/>
<dynamicField name="*_extf" type="file"/>
<dynamicField name="*_random" type="random" />
<!-- uncomment the following to ignore any fields that don't already match an existing
field name or dynamic field, rather than reporting them as an error.
alternately, change the type="ignored" to some other type e.g. "text" if you want
unknown fields indexed and/or stored by default -->
<!--dynamicField name="*" type="ignored" /-->
</fields>
<!-- Field to use to determine and enforce document uniqueness.
Unless this field is marked with required="false", it will be a required field
-->
<uniqueKey>id</uniqueKey>
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
<defaultSearchField>text</defaultSearchField>
</schema>