From 7b34ab8f306ace594b05292667384583eb6c9a1d Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Fri, 19 Jun 2009 12:09:52 +0000 Subject: [PATCH] LUCENE-1673: Move TrieRange to core (part 1: addition to core) git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@786470 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 15 +- .../apache/lucene/spatial/NumberUtils.java | 10 +- .../lucene/analysis/NumericTokenStream.java | 244 +++++++++ .../org/apache/lucene/document/DateField.java | 19 +- .../org/apache/lucene/document/DateTools.java | 12 + .../apache/lucene/document/NumberTools.java | 13 +- .../lucene/search/NumericRangeFilter.java | 122 +++++ .../lucene/search/NumericRangeQuery.java | 410 ++++++++++++++ .../org/apache/lucene/search/RangeFilter.java | 8 +- .../org/apache/lucene/search/RangeQuery.java | 6 +- .../org/apache/lucene/search/package.html | 17 + .../org/apache/lucene/util/NumericUtils.java | 503 ++++++++++++++++++ .../analysis/TestNumericTokenStream.java | 103 ++++ .../search/TestNumericRangeQuery32.java | 431 +++++++++++++++ .../search/TestNumericRangeQuery64.java | 427 +++++++++++++++ .../apache/lucene/util/TestNumericUtils.java | 339 ++++++++++++ 16 files changed, 2671 insertions(+), 8 deletions(-) create mode 100644 src/java/org/apache/lucene/analysis/NumericTokenStream.java create mode 100644 src/java/org/apache/lucene/search/NumericRangeFilter.java create mode 100644 src/java/org/apache/lucene/search/NumericRangeQuery.java create mode 100644 src/java/org/apache/lucene/util/NumericUtils.java create mode 100644 src/test/org/apache/lucene/analysis/TestNumericTokenStream.java create mode 100644 src/test/org/apache/lucene/search/TestNumericRangeQuery32.java create mode 100644 src/test/org/apache/lucene/search/TestNumericRangeQuery64.java create mode 100644 src/test/org/apache/lucene/util/TestNumericUtils.java diff --git a/CHANGES.txt b/CHANGES.txt index a740ac377dc..73e89040469 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -207,6 +207,10 @@ 
API Changes ReadOnlySegmentReader.class, along with the src/gcj/* specializations for GCJ, are now deprecated, to be removed in 3.0. (Earwin Burrfoot via Mike McCandless) + +23. LUCENE-1673: Deprecated NumberTools in favour of the new + NumericRangeQuery and its new indexing format for numeric or + date values. (Uwe Schindler) Bug fixes @@ -408,8 +412,15 @@ Bug fixes via Mike McCandless) 26. LUCENE-1550: Added new n-gram based String distance measure for spell checking. - See the Javadocs for NGramDistance.java for a reference paper on why this is helpful (Tom Morton via Grant Ingersoll) - + See the Javadocs for NGramDistance.java for a reference paper on why + this is helpful (Tom Morton via Grant Ingersoll) + +27. LUCENE-1470, LUCENE-1582, LUCENE-1602, LUCENE-1673: Added + NumericRangeQuery and NumericRangeFilter, a fast alternative to + RangeQuery/RangeFilter for numeric searches. They depend on a specific + structure of terms in the index that can be created by indexing + using the new NumericTokenStream class. (Uwe Schindler, + Yonik Seeley, Mike McCandless) Optimizations diff --git a/contrib/spatial/src/java/org/apache/lucene/spatial/NumberUtils.java b/contrib/spatial/src/java/org/apache/lucene/spatial/NumberUtils.java index d256011289c..f2dd5f8463a 100644 --- a/contrib/spatial/src/java/org/apache/lucene/spatial/NumberUtils.java +++ b/contrib/spatial/src/java/org/apache/lucene/spatial/NumberUtils.java @@ -17,12 +17,20 @@ package org.apache.lucene.spatial; +import org.apache.lucene.analysis.NumericTokenStream; // for javadocs +import org.apache.lucene.search.NumericRangeQuery; // for javadocs +import org.apache.lucene.util.NumericUtils; // for javadocs + /** * TODO -- when solr moves NumberUtils to lucene, this should be redundant * * This is a copy of solr's number utils with only the functions we use... * - * @deprecated will be replaced with lucene version of solr copy... + * @deprecated TODO: This helper class will be removed soonly. 
+ * For new indexes use {@link NumericUtils} instead, which provides a sortable + * binary representation (prefix encoded) of numeric values. + * To index and efficiently query numeric values use {@link NumericTokenStream} + * and {@link NumericRangeQuery}. */ @Deprecated public class NumberUtils { diff --git a/src/java/org/apache/lucene/analysis/NumericTokenStream.java b/src/java/org/apache/lucene/analysis/NumericTokenStream.java new file mode 100644 index 00000000000..274630b78dc --- /dev/null +++ b/src/java/org/apache/lucene/analysis/NumericTokenStream.java @@ -0,0 +1,244 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.search.NumericRangeQuery; // for javadocs +import org.apache.lucene.search.NumericRangeFilter; // for javadocs +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +/** + * This class provides a {@link TokenStream} for indexing numeric values + * that can be used by {@link NumericRangeQuery}/{@link NumericRangeFilter}. 
+ * For more information on how to use this class and its configuration properties + * (precisionStep), + * read the docs of {@link NumericRangeQuery}. + * + *

This stream is not intended to be used in analyzers; it is meant for iterating over the + * different precisions while indexing a specific numeric value. + * A numeric value is indexed as multiple string encoded terms, each reduced + * by zeroing bits from the right. Each value is also prefixed (in the first char) by the + * shift value (number of bits removed) used during encoding. + * The number of bits removed from the right for each trie entry is called + * precisionStep in this API. + * + *

The usage pattern is (it is recommended to switch off norms and term frequencies + * for numeric fields; it does not make sense to have them): + *

+ *  Field field = new Field(name, new NumericTokenStream(precisionStep).set???Value(value));
+ *  field.setOmitNorms(true);
+ *  field.setOmitTermFreqAndPositions(true);
+ *  document.add(field);
+ * 
+ *

For optimal performance, re-use the TokenStream and Field instance + * for more than one document: + *

+ *  // init
+ *  NumericTokenStream stream = new NumericTokenStream(precisionStep);
+ *  Field field = new Field(name, stream);
+ *  field.setOmitNorms(true);
+ *  field.setOmitTermFreqAndPositions(true);
+ *  Document document = new Document();
+ *  document.add(field);
+ *  // use this code to index many documents:
+ *  stream.set???Value(value1);
+ *  writer.addDocument(document);
+ *  stream.set???Value(value2);
+ *  writer.addDocument(document);
+ *  ...
+ * 
+ *

Please note: Token streams are read when the document is added to the index. + * If you index more than one numeric field, use a separate instance for each. + * + *

Values indexed by this stream can be sorted on or loaded into the field cache. + * For that factories like {@link NumericUtils#getLongSortField} are available, + * as well as parsers for filling the field cache (e.g., {@link NumericUtils#FIELD_CACHE_LONG_PARSER}) + * + * @since 2.9 + */ +public final class NumericTokenStream extends TokenStream { + + /** The full precision 64 bit token gets this token type assigned. */ + public static final String TOKEN_TYPE_FULL_PREC_64 = "fullPrecNumeric64"; + + /** The lower precision 64 bit tokens gets this token type assigned. */ + public static final String TOKEN_TYPE_LOWER_PREC_64 = "lowerPrecNumeric64"; + + /** The full precision 32 bit token gets this token type assigned. */ + public static final String TOKEN_TYPE_FULL_PREC_32 = "fullPrecNumeric32"; + + /** The lower precision 32 bit tokens gets this token type assigned. */ + public static final String TOKEN_TYPE_LOWER_PREC_32 = "lowerPrecNumeric32"; + + /** + * Creates a token stream for numeric values. The stream is not yet initialized, + * before using set a value using the various set???Value() methods. + */ + public NumericTokenStream(final int precisionStep) { + this.precisionStep = precisionStep; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + } + + /** + * Initializes the token stream with the supplied long value. + * @param value the value, for which this TokenStream should enumerate tokens. + * @return this instance, because of this you can use it the following way: + * new Field(name, new NumericTokenStream(precisionStep).setLongValue(value)) + */ + public NumericTokenStream setLongValue(final long value) { + this.value = value; + valSize = 64; + shift = 0; + return this; + } + + /** + * Initializes the token stream with the supplied int value. 
+ * @param value the value, for which this TokenStream should enumerate tokens. + * @return this instance, because of this you can use it the following way: + * new Field(name, new NumericTokenStream(precisionStep).setIntValue(value)) + */ + public NumericTokenStream setIntValue(final int value) { + this.value = (long) value; + valSize = 32; + shift = 0; + return this; + } + + /** + * Initializes the token stream with the supplied double value. + * @param value the value, for which this TokenStream should enumerate tokens. + * @return this instance, because of this you can use it the following way: + * new Field(name, new NumericTokenStream(precisionStep).setDoubleValue(value)) + */ + public NumericTokenStream setDoubleValue(final double value) { + this.value = NumericUtils.doubleToSortableLong(value); + valSize = 64; + shift = 0; + return this; + } + + /** + * Initializes the token stream with the supplied float value. + * @param value the value, for which this TokenStream should enumerate tokens. 
+ * @return this instance, because of this you can use it the following way: + * new Field(name, new NumericTokenStream(precisionStep).setFloatValue(value)) + */ + public NumericTokenStream setFloatValue(final float value) { + this.value = (long) NumericUtils.floatToSortableInt(value); + valSize = 32; + shift = 0; + return this; + } + + // @Override + public void reset() { + if (valSize == 0) + throw new IllegalStateException("call set???Value() before usage"); + if (precisionStep < 1 || precisionStep > valSize) + throw new IllegalArgumentException("precisionStep may only be 1.."+valSize); + shift = 0; + } + + // @Override + public boolean incrementToken() { + if (valSize == 0) + throw new IllegalStateException("call set???Value() before usage"); + if (shift >= valSize) + return false; + + final char[] buffer; + switch (valSize) { + case 64: + buffer = termAtt.resizeTermBuffer(NumericUtils.LONG_BUF_SIZE); + termAtt.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer)); + typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_64 : TOKEN_TYPE_LOWER_PREC_64); + break; + + case 32: + buffer = termAtt.resizeTermBuffer(NumericUtils.INT_BUF_SIZE); + termAtt.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer)); + typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_32 : TOKEN_TYPE_LOWER_PREC_32); + break; + + default: + // should not happen + throw new IllegalArgumentException("valSize must be 32 or 64"); + } + + posIncrAtt.setPositionIncrement((shift == 0) ? 
1 : 0); + shift += precisionStep; + return true; + } + + // @Override + /** @deprecated Will be removed in Lucene 3.0 */ + public Token next(final Token reusableToken) { + assert reusableToken != null; + if (valSize == 0) + throw new IllegalStateException("call set???Value() before usage"); + if (shift >= valSize) + return null; + + reusableToken.clear(); + + final char[] buffer; + switch (valSize) { + case 64: + buffer = reusableToken.resizeTermBuffer(NumericUtils.LONG_BUF_SIZE); + reusableToken.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer)); + reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_64 : TOKEN_TYPE_LOWER_PREC_64); + break; + + case 32: + buffer = reusableToken.resizeTermBuffer(NumericUtils.INT_BUF_SIZE); + reusableToken.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer)); + reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_32 : TOKEN_TYPE_LOWER_PREC_32); + break; + + default: + // should not happen + throw new IllegalArgumentException("valSize must be 32 or 64"); + } + + reusableToken.setPositionIncrement((shift == 0) ? 
1 : 0); + shift += precisionStep; + return reusableToken; + } + + // @Override + public String toString() { + final StringBuffer sb = new StringBuffer("(numeric,valSize=").append(valSize); + sb.append(",precisionStep=").append(precisionStep).append(')'); + return sb.toString(); + } + + // members + private final TermAttribute termAtt; + private final TypeAttribute typeAtt; + private final PositionIncrementAttribute posIncrAtt; + + private int shift = 0, valSize = 0; // valSize==0 means not initialized + private final int precisionStep; + + private long value = 0L; +} diff --git a/src/java/org/apache/lucene/document/DateField.java b/src/java/org/apache/lucene/document/DateField.java index 07b3aeed18e..482064cdd3f 100644 --- a/src/java/org/apache/lucene/document/DateField.java +++ b/src/java/org/apache/lucene/document/DateField.java @@ -19,8 +19,14 @@ package org.apache.lucene.document; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.RangeQuery; +import org.apache.lucene.analysis.NumericTokenStream; // for javadocs +import org.apache.lucene.search.NumericRangeQuery; // for javadocs +import org.apache.lucene.util.NumericUtils; // for javadocs import java.util.Date; // for javadoc +import java.util.Calendar; // for javadoc + +// do not remove in 3.0, needed for reading old indexes! /** * Provides support for converting dates to strings and vice-versa. @@ -38,7 +44,18 @@ import java.util.Date; // for javadoc * indexed when using this class. See {@link DateTools} for an * alternative without such a limitation. * - * @deprecated If you build a new index, use {@link DateTools} instead. This class is included for use with existing + *

+ * Another approach is {@link NumericUtils}, which provides + * a sortable binary representation (prefix encoded) of numeric values, which + * date/time are. + * For indexing a {@link Date} or {@link Calendar}, just get the unix timestamp as + * long using {@link Date#getTime} or {@link Calendar#getTimeInMillis} and + * index this as a numeric value with {@link NumericTokenStream} + * and use {@link NumericRangeQuery} to query it. + * + * @deprecated If you build a new index, use {@link DateTools} or + * {@link NumericTokenStream} instead. + * This class is included for use with existing * indices and will be removed in a future release. */ public class DateField { diff --git a/src/java/org/apache/lucene/document/DateTools.java b/src/java/org/apache/lucene/document/DateTools.java index 208321e74bd..7465ad7f44b 100644 --- a/src/java/org/apache/lucene/document/DateTools.java +++ b/src/java/org/apache/lucene/document/DateTools.java @@ -22,6 +22,9 @@ import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; import java.util.TimeZone; +import org.apache.lucene.analysis.NumericTokenStream; // for javadocs +import org.apache.lucene.search.NumericRangeQuery; // for javadocs +import org.apache.lucene.util.NumericUtils; // for javadocs /** * Provides support for converting dates to strings and vice-versa. @@ -36,6 +39,15 @@ import java.util.TimeZone; *

Compared to {@link DateField} the strings generated by the methods * in this class take slightly more space, unless your selected resolution * is set to Resolution.DAY or lower. + * + *

+ * Another approach is {@link NumericUtils}, which provides + * a sortable binary representation (prefix encoded) of numeric values, which + * date/time are. + * For indexing a {@link Date} or {@link Calendar}, just get the unix timestamp as + * long using {@link Date#getTime} or {@link Calendar#getTimeInMillis} and + * index this as a numeric value with {@link NumericTokenStream} + * and use {@link NumericRangeQuery} to query it. */ public class DateTools { diff --git a/src/java/org/apache/lucene/document/NumberTools.java b/src/java/org/apache/lucene/document/NumberTools.java index 1d70216d261..476de4cd19e 100644 --- a/src/java/org/apache/lucene/document/NumberTools.java +++ b/src/java/org/apache/lucene/document/NumberTools.java @@ -17,6 +17,11 @@ package org.apache.lucene.document; * limitations under the License. */ +import org.apache.lucene.analysis.NumericTokenStream; // for javadocs +import org.apache.lucene.search.NumericRangeQuery; // for javadocs +import org.apache.lucene.util.NumericUtils; // for javadocs + +// do not remove this class in 3.0, it may be needed to decode old indexes! /** * Provides support for converting longs to Strings, and back again. The strings @@ -31,7 +36,13 @@ package org.apache.lucene.document; * This class handles all long values (unlike * {@link org.apache.lucene.document.DateField}). * - * + * @deprecated For new indexes use {@link NumericUtils} instead, which + * provides a sortable binary representation (prefix encoded) of numeric + * values. + * To index and efficiently query numeric values use {@link NumericTokenStream} + * and {@link NumericRangeQuery}. + * This class is included for use with existing + * indices and will be removed in a future release. 
*/ public class NumberTools { diff --git a/src/java/org/apache/lucene/search/NumericRangeFilter.java b/src/java/org/apache/lucene/search/NumericRangeFilter.java new file mode 100644 index 00000000000..ca4ad721609 --- /dev/null +++ b/src/java/org/apache/lucene/search/NumericRangeFilter.java @@ -0,0 +1,122 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.NumericTokenStream; // for javadocs + +/** + * Implementation of a {@link Filter} that implements trie-based range filtering + * for numeric values. For more information about the algorithm look into the docs of + * {@link NumericRangeQuery}. + * + *

This filter depends on a specific structure of terms in the index that can only be created + * by indexing using {@link NumericTokenStream}. + * + *

Please note: This class has no public constructor; you can create filters depending on the data type + * by using the static factories {@linkplain #newLongRange NumericRangeFilter.newLongRange()}, + * {@linkplain #newIntRange NumericRangeFilter.newIntRange()}, {@linkplain #newDoubleRange NumericRangeFilter.newDoubleRange()}, + * and {@linkplain #newFloatRange NumericRangeFilter.newFloatRange()}, e.g.: + *

+ * Filter f = NumericRangeFilter.newFloatRange(field, precisionStep,
+ *                                             new Float(0.03f), new Float(0.10f),
+ *                                             true, true);
+ * 
+ * @since 2.9 + **/ +public final class NumericRangeFilter extends MultiTermQueryWrapperFilter { + + private NumericRangeFilter(final NumericRangeQuery query) { + super(query); + } + + /** + * Factory that creates a NumericRangeFilter, that filters a long + * range using the given precisionStep. + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeFilter newLongRange(final String field, final int precisionStep, + Long min, Long max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newLongRange(field, precisionStep, min, max, minInclusive, maxInclusive) + ); + } + + /** + * Factory that creates a NumericRangeFilter, that filters a int + * range using the given precisionStep. + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeFilter newIntRange(final String field, final int precisionStep, + Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newIntRange(field, precisionStep, min, max, minInclusive, maxInclusive) + ); + } + + /** + * Factory that creates a NumericRangeFilter, that filters a double + * range using the given precisionStep. + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. 
+ */ + public static NumericRangeFilter newDoubleRange(final String field, final int precisionStep, + Double min, Double max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newDoubleRange(field, precisionStep, min, max, minInclusive, maxInclusive) + ); + } + + /** + * Factory that creates a NumericRangeFilter, that filters a float + * range using the given precisionStep. + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeFilter newFloatRange(final String field, final int precisionStep, + Float min, Float max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newFloatRange(field, precisionStep, min, max, minInclusive, maxInclusive) + ); + } + + /** Returns the field name for this filter */ + public String getField() { return ((NumericRangeQuery)query).getField(); } + + /** Returns true if the lower endpoint is inclusive */ + public boolean includesMin() { return ((NumericRangeQuery)query).includesMin(); } + + /** Returns true if the upper endpoint is inclusive */ + public boolean includesMax() { return ((NumericRangeQuery)query).includesMax(); } + + /** Returns the lower value of this range filter */ + public Number getMin() { return ((NumericRangeQuery)query).getMin(); } + + /** Returns the upper value of this range filter */ + public Number getMax() { return ((NumericRangeQuery)query).getMax(); } + +} diff --git a/src/java/org/apache/lucene/search/NumericRangeQuery.java b/src/java/org/apache/lucene/search/NumericRangeQuery.java new file mode 100644 index 00000000000..a4b06a8aede --- /dev/null +++ b/src/java/org/apache/lucene/search/NumericRangeQuery.java @@ -0,0 +1,410 @@ +package org.apache.lucene.search; 
+ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.LinkedList; + +import org.apache.lucene.analysis.NumericTokenStream; // for javadocs +import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; + +/** + * Implementation of a {@link Query} that implements trie-based range querying + * for numeric values. + * + *

Usage

+ *

Indexing

+ * Before numeric values can be queried, they must be indexed in a special way. You can do this + * by adding numeric fields to the index by specifying a {@link NumericTokenStream}. + * An important setting is the precisionStep, which specifies, + * how many different precisions per numeric value are indexed to speed up range queries. + * Lower values create more terms but speed up search, higher values create less terms, but + * slow down search. Suitable values are 2, 4, or 8. A good starting point to test is 4. + * For code examples see {@link NumericTokenStream}. + * + *

Searching

+ *

This class has no public constructor; you can create queries depending on the data type + * by using the static factories {@linkplain #newLongRange NumericRangeQuery.newLongRange()}, + * {@linkplain #newIntRange NumericRangeQuery.newIntRange()}, {@linkplain #newDoubleRange NumericRangeQuery.newDoubleRange()}, + * and {@linkplain #newFloatRange NumericRangeQuery.newFloatRange()}, e.g.: + *

+ * Query q = NumericRangeQuery.newFloatRange(field, precisionStep,
+ *                                           new Float(0.03f), new Float(0.10f),
+ *                                           true, true);
+ * 
+ * + *

How it works

+ * + *

See the publication about panFMP, + * where this algorithm was described (referred to as TrieRangeQuery): + * + *

Schindler, U, Diepenbroek, M, 2008. + * Generic XML-based Framework for Metadata Portals. + * Computers & Geosciences 34 (12), 1947-1955. + * doi:10.1016/j.cageo.2008.02.023
+ * + *

A quote from this paper: Because Apache Lucene is a full-text + * search engine and not a conventional database, it cannot handle numerical ranges + * (e.g., field value is inside user defined bounds, even dates are numerical values). + * We have developed an extension to Apache Lucene that stores + * the numerical values in a special string-encoded format with variable precision + * (all numerical values like doubles, longs, floats, and ints are converted to + * lexicographic sortable string representations and stored with different precisions + * (for a more detailed description of how the values are stored, + * see {@link NumericUtils}). A range is then divided recursively into multiple intervals for searching: + * The center of the range is searched only with the lowest possible precision in the trie, + * while the boundaries are matched more exactly. This reduces the number of terms dramatically.

+ * + *

For the variant that stores long values in 8 different precisions (each reduced by 8 bits) that + * uses a lowest precision of 1 byte, the index contains only a maximum of 256 distinct values in the + * lowest precision. Overall, a range could consist of a theoretical maximum of + * 7*255*2 + 255 = 3825 distinct terms (when there is a term for every distinct value of an + * 8-byte-number in the index and the range covers almost all of them; a maximum of 255 distinct values is used + * because it would always be possible to reduce the full 256 values to one term with degraded precision). + * In practice, we have seen up to 300 terms in most cases (index with 500,000 metadata records + * and a uniform value distribution).

+ * + *

Precision Step

+ *

You can choose any precisionStep when encoding values. + * Lower step values mean more precisions and so more terms in the index (and the index gets larger). + * On the other hand, the maximum number of terms to match reduces, which optimizes query speed. + * The formula to calculate the maximum term count is: + *

+ *  n = [ (bitsPerValue/precisionStep - 1) * (2^precisionStep - 1 ) * 2 ] + (2^precisionStep - 1 )
+ * 
+ *

(this formula is only correct, when bitsPerValue/precisionStep is an integer; + * in other cases, the value must be rounded up and the last summand must contain the modulo of the division as + * precision step). + * For longs stored using a precision step of 4, n = 15*15*2 + 15 = 465, and for a precision + * step of 2, n = 31*3*2 + 3 = 189. But the faster search speed is reduced by more seeking + * in the term enum of the index. Because of this, the ideal precisionStep value can only + * be found out by testing. Important: You can index with a lower precision step value and test search speed + * using a multiple of the original step value.

+ * + *

This dramatically improves the performance of Apache Lucene with range queries, which + * are no longer dependent on the index size and the number of distinct values because there is + * an upper limit unrelated to either of these properties.

+ * + *

Comparisons of the different types of RangeQueries on an index with about 500,000 docs showed + * that the old {@link RangeQuery} (with raised {@link BooleanQuery} clause count) took about 30-40 + * secs to complete, {@link ConstantScoreRangeQuery} took 5 secs and executing + * this class took <100ms to complete (on an Opteron64 machine, Java 1.5, 8 bit precision step). + * This query type was developed for a geographic portal, where the performance for + * e.g. bounding boxes or exact date/time stamps is important.

+ * + *

The query is in {@linkplain #setConstantScoreRewrite constant score mode} per default. + * With precision steps of ≤4, this query can be run in conventional {@link BooleanQuery} + * rewrite mode without changing the max clause count. + * @since 2.9 + **/ +public final class NumericRangeQuery extends MultiTermQuery { + + private NumericRangeQuery(final String field, final int precisionStep, final int valSize, + Number min, Number max, final boolean minInclusive, final boolean maxInclusive + ) { + assert (valSize == 32 || valSize == 64); + if (precisionStep < 1 || precisionStep > valSize) + throw new IllegalArgumentException("precisionStep may only be 1.."+valSize); + this.field = field.intern(); + this.precisionStep = precisionStep; + this.valSize = valSize; + this.min = min; + this.max = max; + this.minInclusive = minInclusive; + this.maxInclusive = maxInclusive; + setConstantScoreRewrite(true); + } + + /** + * Factory that creates a NumericRangeQuery, that queries a long + * range using the given precisionStep. + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeQuery newLongRange(final String field, final int precisionStep, + Long min, Long max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, precisionStep, 64, min, max, minInclusive, maxInclusive); + } + + /** + * Factory that creates a NumericRangeQuery, that queries a int + * range using the given precisionStep. + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. 
+ */ + public static NumericRangeQuery newIntRange(final String field, final int precisionStep, + Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, precisionStep, 32, min, max, minInclusive, maxInclusive); + } + + /** + * Factory that creates a NumericRangeQuery, that queries a double + * range using the given precisionStep. + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeQuery newDoubleRange(final String field, final int precisionStep, + Double min, Double max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, precisionStep, 64, min, max, minInclusive, maxInclusive); + } + + /** + * Factory that creates a NumericRangeQuery, that queries a float + * range using the given precisionStep. + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. 
+ */ + public static NumericRangeQuery newFloatRange(final String field, final int precisionStep, + Float min, Float max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, precisionStep, 32, min, max, minInclusive, maxInclusive); + } + + //@Override + protected FilteredTermEnum getEnum(final IndexReader reader) throws IOException { + return new NumericRangeTermEnum(reader); + } + + /** Returns the field name for this query */ + public String getField() { return field; } + + /** Returns true if the lower endpoint is inclusive */ + public boolean includesMin() { return minInclusive; } + + /** Returns true if the upper endpoint is inclusive */ + public boolean includesMax() { return maxInclusive; } + + /** Returns the lower value of this range query */ + public Number getMin() { return min; } + + /** Returns the upper value of this range query */ + public Number getMax() { return max; } + + //@Override + public String toString(final String field) { + final StringBuffer sb = new StringBuffer(); + if (!this.field.equals(field)) sb.append(this.field).append(':'); + return sb.append(minInclusive ? '[' : '{') + .append((min == null) ? "*" : min.toString()) + .append(" TO ") + .append((max == null) ? "*" : max.toString()) + .append(maxInclusive ? ']' : '}') + .append(ToStringUtils.boost(getBoost())) + .toString(); + } + + //@Override + public final boolean equals(final Object o) { + if (o==this) return true; + if (o==null) return false; + if (o instanceof NumericRangeQuery) { + final NumericRangeQuery q=(NumericRangeQuery)o; + return ( + field==q.field && + (q.min == null ? min == null : q.min.equals(min)) && + (q.max == null ? 
max == null : q.max.equals(max)) && + minInclusive == q.minInclusive && + maxInclusive == q.maxInclusive && + precisionStep == q.precisionStep && + getBoost() == q.getBoost() + ); + } + return false; + } + + //@Override + public final int hashCode() { + int hash = Float.floatToIntBits(getBoost()) ^ field.hashCode(); + hash += precisionStep^0x64365465; + if (min != null) hash += min.hashCode()^0x14fa55fb; + if (max != null) hash += max.hashCode()^0x733fa5fe; + return hash+ + (Boolean.valueOf(minInclusive).hashCode()^0x14fa55fb)+ + (Boolean.valueOf(maxInclusive).hashCode()^0x733fa5fe); + } + + // members (package private, to be also fast accessible by NumericRangeTermEnum) + final String field; + final int precisionStep, valSize; + final Number min, max; + final boolean minInclusive,maxInclusive; + + /** + * Subclass of FilteredTermEnum for enumerating all terms that match the + * sub-ranges for trie range queries. + *

+ * WARNING: This term enumeration is not guaranteed to be always ordered by + * {@link Term#compareTo}. + * The ordering depends on how {@link NumericUtils#splitLongRange} and + * {@link NumericUtils#splitIntRange} generates the sub-ranges. For + * {@link MultiTermQuery} ordering is not relevant. + */ + private final class NumericRangeTermEnum extends FilteredTermEnum { + + private final IndexReader reader; + private final LinkedList/**/ rangeBounds = new LinkedList/**/(); + private String currentUpperBound = null; + + NumericRangeTermEnum(final IndexReader reader) throws IOException { + this.reader = reader; + + switch (valSize) { + case 64: { + // lower + long minBound = Long.MIN_VALUE; + if (min instanceof Long) { + minBound = min.longValue(); + } else if (min instanceof Double) { + minBound = NumericUtils.doubleToSortableLong(min.doubleValue()); + } + if (!minInclusive && min != null) minBound++; + + // upper + long maxBound = Long.MAX_VALUE; + if (max instanceof Long) { + maxBound = max.longValue(); + } else if (max instanceof Double) { + maxBound = NumericUtils.doubleToSortableLong(max.doubleValue()); + } + if (!maxInclusive && max != null) maxBound--; + + NumericUtils.splitLongRange(new NumericUtils.LongRangeBuilder() { + //@Override + public final void addRange(String minPrefixCoded, String maxPrefixCoded) { + rangeBounds.add(minPrefixCoded); + rangeBounds.add(maxPrefixCoded); + } + }, precisionStep, minBound, maxBound); + break; + } + + case 32: { + // lower + int minBound = Integer.MIN_VALUE; + if (min instanceof Integer) { + minBound = min.intValue(); + } else if (min instanceof Float) { + minBound = NumericUtils.floatToSortableInt(min.floatValue()); + } + if (!minInclusive && min != null) minBound++; + + // upper + int maxBound = Integer.MAX_VALUE; + if (max instanceof Integer) { + maxBound = max.intValue(); + } else if (max instanceof Float) { + maxBound = NumericUtils.floatToSortableInt(max.floatValue()); + } + if (!maxInclusive && max != null) 
maxBound--; + + NumericUtils.splitIntRange(new NumericUtils.IntRangeBuilder() { + //@Override + public final void addRange(String minPrefixCoded, String maxPrefixCoded) { + rangeBounds.add(minPrefixCoded); + rangeBounds.add(maxPrefixCoded); + } + }, precisionStep, minBound, maxBound); + break; + } + + default: + // should never happen + throw new IllegalArgumentException("valSize must be 32 or 64"); + } + + // seek to first term + next(); + } + + //@Override + public float difference() { + return 1.0f; + } + + /** this is a dummy, it is not used by this class. */ + //@Override + protected boolean endEnum() { + assert false; // should never be called + return (currentTerm != null); + } + + /** + * Compares if current upper bound is reached, + * this also updates the term count for statistics. + * In contrast to {@link FilteredTermEnum}, a return value + * of false ends iterating the current enum + * and forwards to the next sub-range. + */ + //@Override + protected boolean termCompare(Term term) { + return (term.field() == field && term.text().compareTo(currentUpperBound) <= 0); + } + + /** Increments the enumeration to the next element. True if one exists. 
*/ + //@Override + public boolean next() throws IOException { + // if a current term exists, the actual enum is initialized: + // try change to next term, if no such term exists, fall-through + if (currentTerm != null) { + assert actualEnum!=null; + if (actualEnum.next()) { + currentTerm = actualEnum.term(); + if (termCompare(currentTerm)) return true; + } + } + // if all above fails, we go forward to the next enum, + // if one is available + currentTerm = null; + if (rangeBounds.size() < 2) return false; + // close the current enum and read next bounds + if (actualEnum != null) { + actualEnum.close(); + actualEnum = null; + } + final String lowerBound = (String)rangeBounds.removeFirst(); + this.currentUpperBound = (String)rangeBounds.removeFirst(); + // this call recursively uses next(), if no valid term in + // next enum found. + // if this behavior is changed/modified in the superclass, + // this enum will not work anymore! + setEnum(reader.terms(new Term(field, lowerBound))); + return (currentTerm != null); + } + + /** Closes the enumeration to further activity, freeing resources. */ + //@Override + public void close() throws IOException { + rangeBounds.clear(); + currentUpperBound = null; + super.close(); + } + + } + +} diff --git a/src/java/org/apache/lucene/search/RangeFilter.java b/src/java/org/apache/lucene/search/RangeFilter.java index de93456814b..97b68461d11 100644 --- a/src/java/org/apache/lucene/search/RangeFilter.java +++ b/src/java/org/apache/lucene/search/RangeFilter.java @@ -22,8 +22,12 @@ import java.text.Collator; /** * A Filter that restricts search results to a range of values in a given * field. - * - * If you construct a large number of range filters with different ranges but on the + * + *

This filter matches the documents looking for terms that fall into the + * supplied range according to {@link String#compareTo(String)}. It is not intended + * for numerical ranges; use {@link NumericRangeFilter} instead. + * + *

If you construct a large number of range filters with different ranges but on the * same field, {@link FieldCacheRangeFilter} may have significantly better performance. */ public class RangeFilter extends MultiTermQueryWrapperFilter { diff --git a/src/java/org/apache/lucene/search/RangeQuery.java b/src/java/org/apache/lucene/search/RangeQuery.java index a29fbaa5ec7..5db295911d0 100644 --- a/src/java/org/apache/lucene/search/RangeQuery.java +++ b/src/java/org/apache/lucene/search/RangeQuery.java @@ -26,7 +26,11 @@ import org.apache.lucene.index.IndexReader; /** * A Query that matches documents within an exclusive range. * - * See {@link MultiTermQuery#setConstantScoreRewrite} for the tradeoffs between + *

This query matches the documents looking for terms that fall into the + * supplied range according to {@link String#compareTo(String)}. It is not intended + * for numerical ranges; use {@link NumericRangeQuery} instead. + * + *

See {@link MultiTermQuery#setConstantScoreRewrite} for the tradeoffs between * enabling and disabling constantScoreRewrite mode. */ diff --git a/src/java/org/apache/lucene/search/package.html b/src/java/org/apache/lucene/search/package.html index eae5d94045c..61af1f14ae4 100644 --- a/src/java/org/apache/lucene/search/package.html +++ b/src/java/org/apache/lucene/search/package.html @@ -136,6 +136,7 @@ org.apache.lucene.search.Searcher#search(Query,Filter)}.

+

RangeQuery

@@ -147,12 +148,28 @@ org.apache.lucene.search.Searcher#search(Query,Filter)}. Term and an upper Term. + according to {@link java.lang.String#compareTo(String)}. It is not intended + for numerical ranges, use NumericRangeQuery instead. + For example, one could find all documents that have terms beginning with the letters a through c. This type of Query is frequently used to find documents that occur in a specific date range.

+ +

+ NumericRangeQuery +

+ +

The + NumericRangeQuery + matches all documents that occur in a numeric range. + For NumericRangeQuery to work, you must index the values + using a special + NumericTokenStream. +

+

PrefixQuery, WildcardQuery diff --git a/src/java/org/apache/lucene/util/NumericUtils.java b/src/java/org/apache/lucene/util/NumericUtils.java new file mode 100644 index 00000000000..65276261e7a --- /dev/null +++ b/src/java/org/apache/lucene/util/NumericUtils.java @@ -0,0 +1,503 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.NumericTokenStream; // for javadocs +import org.apache.lucene.search.NumericRangeQuery; // for javadocs +import org.apache.lucene.search.NumericRangeFilter; // for javadocs +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.ExtendedFieldCache; + +/** + * This is a helper class to generate prefix-encoded representations for numerical values + * and supplies converters to represent float/double values as sortable integers/longs. + * + *

To quickly execute range queries in Apache Lucene, a range is divided recursively + * into multiple intervals for searching: The center of the range is searched only with + * the lowest possible precision in the trie, while the boundaries are matched + * more exactly. This reduces the number of terms dramatically. + * + *

This class generates terms to achieve this: First the numerical integer values need to + * be converted to strings. For that, integer values (32 bit or 64 bit) are made unsigned + * and the bits are converted to ASCII chars with 7 bits each. The resulting string is + * sortable like the original integer value. Each value is also prefixed + * (in the first char) by the shift value (number of bits removed) used + * during encoding. + * + *

To also index floating point numbers, this class supplies two methods to convert them + * to integer values by changing their bit layout: {@link #doubleToSortableLong}, + * {@link #floatToSortableInt}. You will have no precision loss by + * converting floating point numbers to integers and back (only that the integer form + * is not usable). Other data types like dates can easily be converted to longs or ints (e.g. + * date to long: {@link java.util.Date#getTime}). + * + *

For easy usage, the trie algorithm is implemented for indexing inside + * {@link NumericTokenStream} that can index int, long, + * float, and double. For querying, + * {@link NumericRangeQuery} and {@link NumericRangeFilter} implement the query part + * for the same data types. + * + *

This class can also be used to generate lexicographically sortable (according to + * {@link String#compareTo(String)}) representations of numeric data types for other + * usages (e.g. sorting). + * + *

Prefix encoded fields can also be sorted using the {@link SortField} factories + * {@link #getLongSortField}, {@link #getIntSortField}, {@link #getDoubleSortField} + * or {@link #getFloatSortField}. + * @since 2.9 + */ +public final class NumericUtils { + + private NumericUtils() {} // no instance! + + /** + * Longs are stored at lower precision by shifting off lower bits. The shift count is + * stored as SHIFT_START_LONG+shift in the first character + */ + public static final char SHIFT_START_LONG = (char)0x20; + + /** + * Expert: The maximum term length (used for char[] buffer size) + * for encoding long values. + * @see #longToPrefixCoded(long,int,char[]) + */ + public static final int LONG_BUF_SIZE = 63/7 + 2; + + /** + * Integers are stored at lower precision by shifting off lower bits. The shift count is + * stored as SHIFT_START_INT+shift in the first character + */ + public static final char SHIFT_START_INT = (char)0x60; + + /** + * Expert: The maximum term length (used for char[] buffer size) + * for encoding int values. + * @see #intToPrefixCoded(int,int,char[]) + */ + public static final int INT_BUF_SIZE = 31/7 + 2; + + /** + * A parser instance for filling a {@link ExtendedFieldCache}, that parses prefix encoded fields as longs. + */ + public static final ExtendedFieldCache.LongParser FIELD_CACHE_LONG_PARSER=new ExtendedFieldCache.LongParser(){ + public final long parseLong(final String val) { + final int shift = val.charAt(0)-SHIFT_START_LONG; + if (shift>0 && shift<=63) + throw new FieldCache.StopFillCacheException(); + return prefixCodedToLong(val); + } + }; + + /** + * A parser instance for filling a {@link FieldCache}, that parses prefix encoded fields as ints. 
+ */ + public static final FieldCache.IntParser FIELD_CACHE_INT_PARSER=new FieldCache.IntParser(){ + public final int parseInt(final String val) { + final int shift = val.charAt(0)-SHIFT_START_INT; + if (shift>0 && shift<=31) + throw new FieldCache.StopFillCacheException(); + return prefixCodedToInt(val); + } + }; + + /** + * A parser instance for filling a {@link ExtendedFieldCache}, that parses prefix encoded fields as doubles. + * This uses {@link #sortableLongToDouble} to convert the encoded long to a double. + */ + public static final ExtendedFieldCache.DoubleParser FIELD_CACHE_DOUBLE_PARSER=new ExtendedFieldCache.DoubleParser(){ + public final double parseDouble(final String val) { + final int shift = val.charAt(0)-SHIFT_START_LONG; + if (shift>0 && shift<=63) + throw new FieldCache.StopFillCacheException(); + return sortableLongToDouble(prefixCodedToLong(val)); + } + }; + + /** + * A parser instance for filling a {@link FieldCache}, that parses prefix encoded fields as floats. + * This uses {@link #sortableIntToFloat} to convert the encoded int to a float. + */ + public static final FieldCache.FloatParser FIELD_CACHE_FLOAT_PARSER=new FieldCache.FloatParser(){ + public final float parseFloat(final String val) { + final int shift = val.charAt(0)-SHIFT_START_INT; + if (shift>0 && shift<=31) + throw new FieldCache.StopFillCacheException(); + return sortableIntToFloat(prefixCodedToInt(val)); + } + }; + + /** + * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * This is method is used by {@link NumericTokenStream}. 
+ * @param val the numeric value + * @param shift how many bits to strip from the right + * @param buffer that will contain the encoded chars, must be at least of {@link #LONG_BUF_SIZE} + * length + * @return number of chars written to buffer + */ + public static int longToPrefixCoded(final long val, final int shift, final char[] buffer) { + int nChars = (63-shift)/7 + 1, len = nChars+1; + buffer[0] = (char)(SHIFT_START_LONG + shift); + long sortableBits = val ^ 0x8000000000000000L; + sortableBits >>>= shift; + while (nChars>=1) { + // Store 7 bits per character for good efficiency when UTF-8 encoding. + // The whole number is right-justified so that lucene can prefix-encode + // the terms more efficiently. + buffer[nChars--] = (char)(sortableBits & 0x7f); + sortableBits >>>= 7; + } + return len; + } + + /** + * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * This is method is used by {@link LongRangeBuilder}. + * @param val the numeric value + * @param shift how many bits to strip from the right + */ + public static String longToPrefixCoded(final long val, final int shift) { + if (shift>63 || shift<0) + throw new IllegalArgumentException("Illegal shift value, must be 0..63"); + final char[] buffer = new char[LONG_BUF_SIZE]; + final int len = longToPrefixCoded(val, shift, buffer); + return new String(buffer, 0, len); + } + + /** + * This is a convenience method, that returns prefix coded bits of a long without + * reducing the precision. It can be used to store the full precision value as a + * stored field in index. + *

To decode, use {@link #prefixCodedToLong}. + */ + public static String longToPrefixCoded(final long val) { + return longToPrefixCoded(val, 0); + } + + /** + * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * This is method is used by {@link NumericTokenStream}. + * @param val the numeric value + * @param shift how many bits to strip from the right + * @param buffer that will contain the encoded chars, must be at least of {@link #INT_BUF_SIZE} + * length + * @return number of chars written to buffer + */ + public static int intToPrefixCoded(final int val, final int shift, final char[] buffer) { + int nChars = (31-shift)/7 + 1, len = nChars+1; + buffer[0] = (char)(SHIFT_START_INT + shift); + int sortableBits = val ^ 0x80000000; + sortableBits >>>= shift; + while (nChars>=1) { + // Store 7 bits per character for good efficiency when UTF-8 encoding. + // The whole number is right-justified so that lucene can prefix-encode + // the terms more efficiently. + buffer[nChars--] = (char)(sortableBits & 0x7f); + sortableBits >>>= 7; + } + return len; + } + + /** + * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * This is method is used by {@link IntRangeBuilder}. + * @param val the numeric value + * @param shift how many bits to strip from the right + */ + public static String intToPrefixCoded(final int val, final int shift) { + if (shift>31 || shift<0) + throw new IllegalArgumentException("Illegal shift value, must be 0..31"); + final char[] buffer = new char[INT_BUF_SIZE]; + final int len = intToPrefixCoded(val, shift, buffer); + return new String(buffer, 0, len); + } + + /** + * This is a convenience method, that returns prefix coded bits of an int without + * reducing the precision. It can be used to store the full precision value as a + * stored field in index. + *

To decode, use {@link #prefixCodedToInt}. + */ + public static String intToPrefixCoded(final int val) { + return intToPrefixCoded(val, 0); + } + + /** + * Returns a long from prefixCoded characters. + * Rightmost bits will be zero for lower precision codes. + * This method can be used to decode e.g. a stored field. + * @throws NumberFormatException if the supplied string is + * not correctly prefix encoded. + * @see #longToPrefixCoded(long) + */ + public static long prefixCodedToLong(final String prefixCoded) { + final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG; + if (shift>63 || shift<0) + throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)"); + long sortableBits = 0L; + for (int i=1, len=prefixCoded.length(); i0x7f) { + throw new NumberFormatException( + "Invalid prefixCoded numerical value representation (char "+ + Integer.toHexString((int)ch)+" at position "+i+" is invalid)" + ); + } + sortableBits |= (long)ch; + } + return (sortableBits << shift) ^ 0x8000000000000000L; + } + + /** + * Returns an int from prefixCoded characters. + * Rightmost bits will be zero for lower precision codes. + * This method can be used to decode e.g. a stored field. + * @throws NumberFormatException if the supplied string is + * not correctly prefix encoded. 
+ * @see #intToPrefixCoded(int) + */ + public static int prefixCodedToInt(final String prefixCoded) { + final int shift = prefixCoded.charAt(0)-SHIFT_START_INT; + if (shift>31 || shift<0) + throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really an INT?)"); + int sortableBits = 0; + for (int i=1, len=prefixCoded.length(); i0x7f) { + throw new NumberFormatException( + "Invalid prefixCoded numerical value representation (char "+ + Integer.toHexString((int)ch)+" at position "+i+" is invalid)" + ); + } + sortableBits |= (int)ch; + } + return (sortableBits << shift) ^ 0x80000000; + } + + /** + * Converts a double value to a sortable signed long. + * The value is converted by getting their IEEE 754 floating-point "double format" + * bit layout and then some bits are swapped, to be able to compare the result as long. + * By this the precision is not reduced, but the value can easily used as a long. + * @see #sortableLongToDouble + */ + public static long doubleToSortableLong(double val) { + long f = Double.doubleToLongBits(val); + if (f<0) f ^= 0x7fffffffffffffffL; + return f; + } + + /** + * Converts a sortable long back to a double. + * @see #doubleToSortableLong + */ + public static double sortableLongToDouble(long val) { + if (val<0) val ^= 0x7fffffffffffffffL; + return Double.longBitsToDouble(val); + } + + /** + * Converts a float value to a sortable signed int. + * The value is converted by getting their IEEE 754 floating-point "float format" + * bit layout and then some bits are swapped, to be able to compare the result as int. + * By this the precision is not reduced, but the value can easily used as an int. + * @see #sortableIntToFloat + */ + public static int floatToSortableInt(float val) { + int f = Float.floatToIntBits(val); + if (f<0) f ^= 0x7fffffff; + return f; + } + + /** + * Converts a sortable int back to a float. 
+ * @see #floatToSortableInt + */ + public static float sortableIntToFloat(int val) { + if (val<0) val ^= 0x7fffffff; + return Float.intBitsToFloat(val); + } + + /** A factory method, that generates a {@link SortField} instance for sorting prefix encoded long values. */ + public static SortField getLongSortField(final String field, final boolean reverse) { + return new SortField(field, FIELD_CACHE_LONG_PARSER, reverse); + } + + /** A factory method, that generates a {@link SortField} instance for sorting prefix encoded int values. */ + public static SortField getIntSortField(final String field, final boolean reverse) { + return new SortField(field, FIELD_CACHE_INT_PARSER, reverse); + } + + /** A factory method, that generates a {@link SortField} instance for sorting prefix encoded double values. */ + public static SortField getDoubleSortField(final String field, final boolean reverse) { + return new SortField(field, FIELD_CACHE_DOUBLE_PARSER, reverse); + } + + /** A factory method, that generates a {@link SortField} instance for sorting prefix encoded float values. */ + public static SortField getFloatSortField(final String field, final boolean reverse) { + return new SortField(field, FIELD_CACHE_FLOAT_PARSER, reverse); + } + + /** + * Expert: Splits a long range recursively. + * You may implement a builder that adds clauses to a + * {@link org.apache.lucene.search.BooleanQuery} for each call to its + * {@link LongRangeBuilder#addRange(String,String)} + * method. + *

This method is used by {@link NumericRangeQuery}. + */ + public static void splitLongRange(final LongRangeBuilder builder, + final int precisionStep, final long minBound, final long maxBound + ) { + if (precisionStep<1 || precisionStep>64) + throw new IllegalArgumentException("precisionStep may only be 1..64"); + splitRange(builder, 64, precisionStep, minBound, maxBound); + } + + /** + * Expert: Splits an int range recursively. + * You may implement a builder that adds clauses to a + * {@link org.apache.lucene.search.BooleanQuery} for each call to its + * {@link IntRangeBuilder#addRange(String,String)} + * method. + *

This method is used by {@link NumericRangeQuery}. + */ + public static void splitIntRange(final IntRangeBuilder builder, + final int precisionStep, final int minBound, final int maxBound + ) { + if (precisionStep<1 || precisionStep>32) + throw new IllegalArgumentException("precisionStep may only be 1..32"); + splitRange(builder, 32, precisionStep, (long)minBound, (long)maxBound); + } + + /** This helper does the splitting for both 32 and 64 bit. */ + private static void splitRange( + final Object builder, final int valSize, + final int precisionStep, long minBound, long maxBound + ) { + if (minBound > maxBound) return; + for (int shift=0; ; shift += precisionStep) { + // calculate new bounds for inner precision + final long diff = 1L << (shift+precisionStep), + mask = ((1L<=valSize || nextMinBound>nextMaxBound) { + // We are in the lowest precision or the next precision is not available. + addRange(builder, valSize, minBound, maxBound, shift); + // exit the split recursion loop + break; + } + + if (hasLower) + addRange(builder, valSize, minBound, minBound | mask, shift); + if (hasUpper) + addRange(builder, valSize, maxBound & ~mask, maxBound, shift); + + // recurse to next precision + minBound = nextMinBound; + maxBound = nextMaxBound; + } + } + + /** Helper that delegates to correct range builder */ + private static void addRange( + final Object builder, final int valSize, + long minBound, long maxBound, + final int shift + ) { + // for the max bound set all lower bits (that were shifted away): + // this is important for testing or other usages of the splitted range + // (e.g. to reconstruct the full range). The prefixEncoding will remove + // the bits anyway, so they do not hurt! 
+ maxBound |= (1L << shift) - 1L; + // delegate to correct range builder + switch(valSize) { + case 64: + ((LongRangeBuilder)builder).addRange(minBound, maxBound, shift); + break; + case 32: + ((IntRangeBuilder)builder).addRange((int)minBound, (int)maxBound, shift); + break; + default: + // Should not happen! + throw new IllegalArgumentException("valSize must be 32 or 64."); + } + } + + /** + * Expert: Callback for {@link #splitLongRange}. + * You need to overwrite only one of the methods. + *

WARNING: This is a very low-level interface, + * the method signatures may change in later versions. + */ + public static abstract class LongRangeBuilder { + + /** + * Overwrite this method, if you like to receive the already prefix encoded range bounds. + * You can directly build classical (inclusive) range queries from them. + */ + public void addRange(String minPrefixCoded, String maxPrefixCoded) { + throw new UnsupportedOperationException(); + } + + /** + * Overwrite this method, if you like to receive the raw long range bounds. + * You can use this for e.g. debugging purposes (print out range bounds). + */ + public void addRange(final long min, final long max, final int shift) { + addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift)); + } + + } + + /** + * Expert: Callback for {@link #splitIntRange}. + * You need to overwrite only one of the methods. + *

WARNING: This is a very low-level interface, + * the method signatures may change in later versions. + */ + public static abstract class IntRangeBuilder { + + /** + * Overwrite this method, if you like to receive the already prefix encoded range bounds. + * You can directly build classical range (inclusive) queries from them. + */ + public void addRange(String minPrefixCoded, String maxPrefixCoded) { + throw new UnsupportedOperationException(); + } + + /** + * Overwrite this method, if you like to receive the raw int range bounds. + * You can use this for e.g. debugging purposes (print out range bounds). + */ + public void addRange(final int min, final int max, final int shift) { + addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift)); + } + + } + +} diff --git a/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java b/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java new file mode 100644 index 00000000000..5f225895a13 --- /dev/null +++ b/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java @@ -0,0 +1,103 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + +public class TestNumericTokenStream extends LuceneTestCase { + + static final int precisionStep = 8; + static final long lvalue = 4573245871874382L; + static final int ivalue = 123456; + + public void testLongStreamNewAPI() throws Exception { + final NumericTokenStream stream=new NumericTokenStream(precisionStep).setLongValue(lvalue); + stream.setUseNewAPI(true); + // use getAttribute to test if attributes really exist, if not an IAE will be throwed + final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); + for (int shift=0; shift<64; shift+=precisionStep) { + assertTrue("New token is available", stream.incrementToken()); + assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), termAtt.term()); + } + assertFalse("No more tokens available", stream.incrementToken()); + } + + public void testLongStreamOldAPI() throws Exception { + final NumericTokenStream stream=new NumericTokenStream(precisionStep).setLongValue(lvalue); + stream.setUseNewAPI(false); + Token tok=new Token(); + for (int shift=0; shift<64; shift+=precisionStep) { + assertNotNull("New token is available", tok=stream.next(tok)); + assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), tok.term()); + } + assertNull("No more tokens available", stream.next(tok)); + } + + public void testIntStreamNewAPI() throws Exception { + final NumericTokenStream stream=new NumericTokenStream(precisionStep).setIntValue(ivalue); + stream.setUseNewAPI(true); + // use getAttribute to test if attributes really exist, if not an IAE will be throwed + final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); + for (int shift=0; shift<32; shift+=precisionStep) { + assertTrue("New token is available", stream.incrementToken()); + 
assertEquals("Term is correctly encoded", NumericUtils.intToPrefixCoded(ivalue, shift), termAtt.term()); + } + assertFalse("No more tokens available", stream.incrementToken()); + } + + public void testIntStreamOldAPI() throws Exception { + final NumericTokenStream stream=new NumericTokenStream(precisionStep).setIntValue(ivalue); + stream.setUseNewAPI(false); + Token tok=new Token(); + for (int shift=0; shift<32; shift+=precisionStep) { + assertNotNull("New token is available", tok=stream.next(tok)); + assertEquals("Term is correctly encoded", NumericUtils.intToPrefixCoded(ivalue, shift), tok.term()); + } + assertNull("No more tokens available", stream.next(tok)); + } + + public void testNotInitialized() throws Exception { + final NumericTokenStream stream=new NumericTokenStream(precisionStep); + + try { + stream.reset(); + fail("reset() should not succeed."); + } catch (IllegalStateException e) { + // pass + } + + stream.setUseNewAPI(true); + try { + stream.incrementToken(); + fail("incrementToken() should not succeed."); + } catch (IllegalStateException e) { + // pass + } + + stream.setUseNewAPI(false); + try { + stream.next(new Token()); + fail("next() should not succeed."); + } catch (IllegalStateException e) { + // pass + } + } + +} diff --git a/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java b/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java new file mode 100644 index 00000000000..e5fdd11208c --- /dev/null +++ b/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java @@ -0,0 +1,431 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Random; + +import org.apache.lucene.analysis.NumericTokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.NumericUtils; + +public class TestNumericRangeQuery32 extends LuceneTestCase { + // distance of entries + private static final int distance = 6666; + // shift the starting of the values to the left, to also have negative values: + private static final int startOffset = - 1 << 15; + // number of docs to generate for testing + private static final int noDocs = 10000; + + private static Field newField(String name, int precisionStep) { + NumericTokenStream stream = new NumericTokenStream(precisionStep); + stream.setUseNewAPI(true); + Field f=new Field(name, stream); + f.setOmitTermFreqAndPositions(true); + f.setOmitNorms(true); + return f; + } + + private static final RAMDirectory directory; + private static final IndexSearcher searcher; + static { + try { + // set the theoretical maximum term count for 8bit (see docs for the number) + BooleanQuery.setMaxClauseCount(3*255*2 + 255); + + directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), + true, MaxFieldLength.UNLIMITED); + + Field + field8 = newField("field8", 8), + field4 = newField("field4", 4), + 
field2 = newField("field2", 2), + ascfield8 = newField("ascfield8", 8), + ascfield4 = newField("ascfield4", 4), + ascfield2 = newField("ascfield2", 2); + + // Add a series of noDocs docs with increasing int values + for (int l=0; l0) { + assertEquals("Distinct term number is equal for all query types", lastTerms, terms); + } + lastTerms = terms; + } + } + + public void testRange_8bit() throws Exception { + testRange(8); + } + + public void testRange_4bit() throws Exception { + testRange(4); + } + + public void testRange_2bit() throws Exception { + testRange(2); + } + + public void testInverseRange() throws Exception { + NumericRangeFilter f = NumericRangeFilter.newIntRange("field8", 8, new Integer(1000), new Integer(-1000), true, true); + assertSame("A inverse range should return the EMPTY_DOCIDSET instance", DocIdSet.EMPTY_DOCIDSET, f.getDocIdSet(searcher.getIndexReader())); + } + + private void testLeftOpenRange(int precisionStep) throws Exception { + String field="field"+precisionStep; + int count=3000; + int upper=(count-1)*distance + (distance/3) + startOffset; + NumericRangeQuery q=NumericRangeQuery.newIntRange(field, precisionStep, null, new Integer(upper), true, true); + TopDocs topDocs = searcher.search(q, null, noDocs, Sort.INDEXORDER); + System.out.println("Found "+q.getTotalNumberOfTerms()+" distinct terms in left open range for field '"+field+"'."); + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + assertEquals("Score doc count", count, sd.length ); + Document doc=searcher.doc(sd[0].doc); + assertEquals("First doc", startOffset, Integer.parseInt(doc.get("value")) ); + doc=searcher.doc(sd[sd.length-1].doc); + assertEquals("Last doc", (count-1)*distance+startOffset, Integer.parseInt(doc.get("value")) ); + } + + public void testLeftOpenRange_8bit() throws Exception { + testLeftOpenRange(8); + } + + public void testLeftOpenRange_4bit() throws Exception { + testLeftOpenRange(4); + } + + public void testLeftOpenRange_2bit() throws Exception { + 
testLeftOpenRange(2); + } + + private void testRightOpenRange(int precisionStep) throws Exception { + String field="field"+precisionStep; + int count=3000; + int lower=(count-1)*distance + (distance/3) +startOffset; + NumericRangeQuery q=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), null, true, true); + TopDocs topDocs = searcher.search(q, null, noDocs, Sort.INDEXORDER); + System.out.println("Found "+q.getTotalNumberOfTerms()+" distinct terms in right open range for field '"+field+"'."); + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + assertEquals("Score doc count", noDocs-count, sd.length ); + Document doc=searcher.doc(sd[0].doc); + assertEquals("First doc", count*distance+startOffset, Integer.parseInt(doc.get("value")) ); + doc=searcher.doc(sd[sd.length-1].doc); + assertEquals("Last doc", (noDocs-1)*distance+startOffset, Integer.parseInt(doc.get("value")) ); + } + + public void testRightOpenRange_8bit() throws Exception { + testRightOpenRange(8); + } + + public void testRightOpenRange_4bit() throws Exception { + testRightOpenRange(4); + } + + public void testRightOpenRange_2bit() throws Exception { + testRightOpenRange(2); + } + + private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String field="field"+precisionStep; + int termCountT=0,termCountC=0; + for (int i=0; i<50; i++) { + int lower=(int)(rnd.nextDouble()*noDocs*distance)+startOffset; + int upper=(int)(rnd.nextDouble()*noDocs*distance)+startOffset; + if (lower>upper) { + int a=lower; lower=upper; upper=a; + } + // test inclusive range + NumericRangeQuery tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), true, true); + RangeQuery cq=new RangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), true, true); + cq.setConstantScoreRewrite(true); + TopDocs tTopDocs = searcher.search(tq, 1); + TopDocs cTopDocs = 
searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), false, false); + cq=new RangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), false, false); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test left exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), false, true); + cq=new RangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), false, true); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test right exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), true, false); + cq=new RangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), true, false); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + } + 
System.out.println("Average number of terms during random search on '" + field + "':"); + System.out.println(" Trie query: " + (((double)termCountT)/(50*4))); + System.out.println(" Classical query: " + (((double)termCountC)/(50*4))); + } + + public void testRandomTrieAndClassicRangeQuery_8bit() throws Exception { + testRandomTrieAndClassicRangeQuery(8); + } + + public void testRandomTrieAndClassicRangeQuery_4bit() throws Exception { + testRandomTrieAndClassicRangeQuery(4); + } + + public void testRandomTrieAndClassicRangeQuery_2bit() throws Exception { + testRandomTrieAndClassicRangeQuery(2); + } + + private void testRangeSplit(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String field="ascfield"+precisionStep; + // 50 random tests + for (int i=0; i<50; i++) { + int lower=(int)(rnd.nextDouble()*noDocs - noDocs/2); + int upper=(int)(rnd.nextDouble()*noDocs - noDocs/2); + if (lower>upper) { + int a=lower; lower=upper; upper=a; + } + // test inclusive range + Query tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), true, true); + TopDocs tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + // test exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), false, false); + tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to exclusive range length", Math.max(upper-lower-1, 0), tTopDocs.totalHits ); + // test left exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), false, true); + tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to half exclusive range length", upper-lower, tTopDocs.totalHits ); + // test right exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new 
Integer(lower), new Integer(upper), true, false); + tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to half exclusive range length", upper-lower, tTopDocs.totalHits ); + } + } + + public void testRangeSplit_8bit() throws Exception { + testRangeSplit(8); + } + + public void testRangeSplit_4bit() throws Exception { + testRangeSplit(4); + } + + public void testRangeSplit_2bit() throws Exception { + testRangeSplit(2); + } + + /** we fake a float test using int2float conversion of NumericUtils */ + private void testFloatRange(int precisionStep) throws Exception { + final String field="ascfield"+precisionStep; + final int lower=-1000, upper=+2000; + + Query tq=NumericRangeQuery.newFloatRange(field, precisionStep, + new Float(NumericUtils.sortableIntToFloat(lower)), new Float(NumericUtils.sortableIntToFloat(upper)), true, true); + TopDocs tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + + Filter tf=NumericRangeFilter.newFloatRange(field, precisionStep, + new Float(NumericUtils.sortableIntToFloat(lower)), new Float(NumericUtils.sortableIntToFloat(upper)), true, true); + tTopDocs = searcher.search(new MatchAllDocsQuery(), tf, 1); + assertEquals("Returned count of range filter must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + } + + public void testFloatRange_8bit() throws Exception { + testFloatRange(8); + } + + public void testFloatRange_4bit() throws Exception { + testFloatRange(4); + } + + public void testFloatRange_2bit() throws Exception { + testFloatRange(2); + } + + private void testSorting(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String field="field"+precisionStep; + // 10 random tests, the index order is ascending, + // so using a reverse sort field should retun descending documents + for (int i=0; i<10; i++) { + int 
lower=(int)(rnd.nextDouble()*noDocs*distance)+startOffset; + int upper=(int)(rnd.nextDouble()*noDocs*distance)+startOffset; + if (lower>upper) { + int a=lower; lower=upper; upper=a; + } + Query tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), true, true); + TopDocs topDocs = searcher.search(tq, null, noDocs, new Sort(NumericUtils.getIntSortField(field, true))); + if (topDocs.totalHits==0) continue; + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + int last=Integer.parseInt(searcher.doc(sd[0].doc).get("value")); + for (int j=1; jact ); + last=act; + } + } + } + + public void testSorting_8bit() throws Exception { + testSorting(8); + } + + public void testSorting_4bit() throws Exception { + testSorting(4); + } + + public void testSorting_2bit() throws Exception { + testSorting(2); + } + + public void testEqualsAndHash() throws Exception { + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test1", 4, new Integer(10), new Integer(20), true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test2", 4, new Integer(10), new Integer(20), false, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test3", 4, new Integer(10), new Integer(20), true, false)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test4", 4, new Integer(10), new Integer(20), false, false)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test5", 4, new Integer(10), null, true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test6", 4, null, new Integer(20), true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test7", 4, null, null, true, true)); + QueryUtils.checkEqual( + NumericRangeQuery.newIntRange("test8", 4, new Integer(10), new Integer(20), true, true), + NumericRangeQuery.newIntRange("test8", 4, new Integer(10), new Integer(20), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newIntRange("test9", 4, new Integer(10), new 
Integer(20), true, true), + NumericRangeQuery.newIntRange("test9", 8, new Integer(10), new Integer(20), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newIntRange("test10a", 4, new Integer(10), new Integer(20), true, true), + NumericRangeQuery.newIntRange("test10b", 4, new Integer(10), new Integer(20), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newIntRange("test11", 4, new Integer(10), new Integer(20), true, true), + NumericRangeQuery.newIntRange("test11", 4, new Integer(20), new Integer(10), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newIntRange("test12", 4, new Integer(10), new Integer(20), true, true), + NumericRangeQuery.newIntRange("test12", 4, new Integer(10), new Integer(20), false, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newIntRange("test13", 4, new Integer(10), new Integer(20), true, true), + NumericRangeQuery.newFloatRange("test13", 4, new Float(10f), new Float(20f), true, true) + ); + // the following produces a hash collision, because Long and Integer have the same hashcode, so only test equality: + Query q1 = NumericRangeQuery.newIntRange("test14", 4, new Integer(10), new Integer(20), true, true); + Query q2 = NumericRangeQuery.newLongRange("test14", 4, new Long(10L), new Long(20L), true, true); + assertFalse(q1.equals(q2)); + assertFalse(q2.equals(q1)); + } + +} diff --git a/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java b/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java new file mode 100644 index 00000000000..dd8c114ce01 --- /dev/null +++ b/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java @@ -0,0 +1,427 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Random; + +import org.apache.lucene.analysis.NumericTokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.NumericUtils; + +public class TestNumericRangeQuery64 extends LuceneTestCase { + // distance of entries + private static final long distance = 66666L; + // shift the starting of the values to the left, to also have negative values: + private static final long startOffset = - 1L << 31; + // number of docs to generate for testing + private static final int noDocs = 10000; + + private static Field newField(String name, int precisionStep) { + NumericTokenStream stream = new NumericTokenStream(precisionStep); + stream.setUseNewAPI(true); + Field f=new Field(name, stream); + f.setOmitTermFreqAndPositions(true); + f.setOmitNorms(true); + return f; + } + + private static final RAMDirectory directory; + private static final IndexSearcher searcher; + static { + try { + // set the theoretical maximum term count for 8bit (see docs for the number) + BooleanQuery.setMaxClauseCount(7*255*2 + 255); + + directory = new RAMDirectory(); + IndexWriter 
writer = new IndexWriter(directory, new WhitespaceAnalyzer(), + true, MaxFieldLength.UNLIMITED); + + Field + field8 = newField("field8", 8), + field4 = newField("field4", 4), + field2 = newField("field2", 2), + ascfield8 = newField("ascfield8", 8), + ascfield4 = newField("ascfield4", 4), + ascfield2 = newField("ascfield2", 2); + + // Add a series of noDocs docs with increasing long values + for (int l=0; l0) { + assertEquals("Distinct term number is equal for all query types", lastTerms, terms); + } + lastTerms = terms; + } + } + + public void testRange_8bit() throws Exception { + testRange(8); + } + + public void testRange_4bit() throws Exception { + testRange(4); + } + + public void testRange_2bit() throws Exception { + testRange(2); + } + + public void testInverseRange() throws Exception { + NumericRangeFilter f = NumericRangeFilter.newLongRange("field8", 8, new Long(1000L), new Long(-1000L), true, true); + assertSame("A inverse range should return the EMPTY_DOCIDSET instance", DocIdSet.EMPTY_DOCIDSET, f.getDocIdSet(searcher.getIndexReader())); + } + + private void testLeftOpenRange(int precisionStep) throws Exception { + String field="field"+precisionStep; + int count=3000; + long upper=(count-1)*distance + (distance/3) + startOffset; + NumericRangeQuery q=NumericRangeQuery.newLongRange(field, precisionStep, null, new Long(upper), true, true); + TopDocs topDocs = searcher.search(q, null, noDocs, Sort.INDEXORDER); + System.out.println("Found "+q.getTotalNumberOfTerms()+" distinct terms in left open range for field '"+field+"'."); + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + assertEquals("Score doc count", count, sd.length ); + Document doc=searcher.doc(sd[0].doc); + assertEquals("First doc", startOffset, Long.parseLong(doc.get("value")) ); + doc=searcher.doc(sd[sd.length-1].doc); + assertEquals("Last doc", (count-1)*distance+startOffset, Long.parseLong(doc.get("value")) ); + } + + public void testLeftOpenRange_8bit() throws Exception { + 
testLeftOpenRange(8); + } + + public void testLeftOpenRange_4bit() throws Exception { + testLeftOpenRange(4); + } + + public void testLeftOpenRange_2bit() throws Exception { + testLeftOpenRange(2); + } + + private void testRightOpenRange(int precisionStep) throws Exception { + String field="field"+precisionStep; + int count=3000; + long lower=(count-1)*distance + (distance/3) +startOffset; + NumericRangeQuery q=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), null, true, true); + TopDocs topDocs = searcher.search(q, null, noDocs, Sort.INDEXORDER); + System.out.println("Found "+q.getTotalNumberOfTerms()+" distinct terms in right open range for field '"+field+"'."); + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + assertEquals("Score doc count", noDocs-count, sd.length ); + Document doc=searcher.doc(sd[0].doc); + assertEquals("First doc", count*distance+startOffset, Long.parseLong(doc.get("value")) ); + doc=searcher.doc(sd[sd.length-1].doc); + assertEquals("Last doc", (noDocs-1)*distance+startOffset, Long.parseLong(doc.get("value")) ); + } + + public void testRightOpenRange_8bit() throws Exception { + testRightOpenRange(8); + } + + public void testRightOpenRange_4bit() throws Exception { + testRightOpenRange(4); + } + + public void testRightOpenRange_2bit() throws Exception { + testRightOpenRange(2); + } + + private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String field="field"+precisionStep; + int termCountT=0,termCountC=0; + for (int i=0; i<50; i++) { + long lower=(long)(rnd.nextDouble()*noDocs*distance)+startOffset; + long upper=(long)(rnd.nextDouble()*noDocs*distance)+startOffset; + if (lower>upper) { + long a=lower; lower=upper; upper=a; + } + // test inclusive range + NumericRangeQuery tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), true, true); + RangeQuery cq=new RangeQuery(field, 
NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), true, true); + cq.setConstantScoreRewrite(true); + TopDocs tTopDocs = searcher.search(tq, 1); + TopDocs cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), false, false); + cq=new RangeQuery(field, NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), false, false); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test left exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), false, true); + cq=new RangeQuery(field, NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), false, true); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test right exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), true, false); + cq=new RangeQuery(field, NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), true, false); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for 
NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + } + System.out.println("Average number of terms during random search on '" + field + "':"); + System.out.println(" Trie query: " + (((double)termCountT)/(50*4))); + System.out.println(" Classical query: " + (((double)termCountC)/(50*4))); + } + + public void testRandomTrieAndClassicRangeQuery_8bit() throws Exception { + testRandomTrieAndClassicRangeQuery(8); + } + + public void testRandomTrieAndClassicRangeQuery_4bit() throws Exception { + testRandomTrieAndClassicRangeQuery(4); + } + + public void testRandomTrieAndClassicRangeQuery_2bit() throws Exception { + testRandomTrieAndClassicRangeQuery(2); + } + + private void testRangeSplit(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String field="ascfield"+precisionStep; + // 50 random tests + for (int i=0; i<50; i++) { + long lower=(long)(rnd.nextDouble()*noDocs - noDocs/2); + long upper=(long)(rnd.nextDouble()*noDocs - noDocs/2); + if (lower>upper) { + long a=lower; lower=upper; upper=a; + } + // test inclusive range + Query tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), true, true); + TopDocs tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + // test exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), false, false); + tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to exclusive range length", Math.max(upper-lower-1, 0), tTopDocs.totalHits ); + // test left exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), false, true); + tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must 
be equal to half exclusive range length", upper-lower, tTopDocs.totalHits ); + // test right exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), true, false); + tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to half exclusive range length", upper-lower, tTopDocs.totalHits ); + } + } + + public void testRangeSplit_8bit() throws Exception { + testRangeSplit(8); + } + + public void testRangeSplit_4bit() throws Exception { + testRangeSplit(4); + } + + public void testRangeSplit_2bit() throws Exception { + testRangeSplit(2); + } + + /** we fake a double test using long2double conversion of NumericUtils */ + private void testDoubleRange(int precisionStep) throws Exception { + final String field="ascfield"+precisionStep; + final long lower=-1000L, upper=+2000L; + + Query tq=NumericRangeQuery.newDoubleRange(field, precisionStep, + new Double(NumericUtils.sortableLongToDouble(lower)), new Double(NumericUtils.sortableLongToDouble(upper)), true, true); + TopDocs tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + + Filter tf=NumericRangeFilter.newDoubleRange(field, precisionStep, + new Double(NumericUtils.sortableLongToDouble(lower)), new Double(NumericUtils.sortableLongToDouble(upper)), true, true); + tTopDocs = searcher.search(new MatchAllDocsQuery(), tf, 1); + assertEquals("Returned count of range filter must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + } + + public void testDoubleRange_8bit() throws Exception { + testDoubleRange(8); + } + + public void testDoubleRange_4bit() throws Exception { + testDoubleRange(4); + } + + public void testDoubleRange_2bit() throws Exception { + testDoubleRange(2); + } + + private void testSorting(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String 
field="field"+precisionStep; + // 10 random tests, the index order is ascending, + // so using a reverse sort field should retun descending documents + for (int i=0; i<10; i++) { + long lower=(long)(rnd.nextDouble()*noDocs*distance)+startOffset; + long upper=(long)(rnd.nextDouble()*noDocs*distance)+startOffset; + if (lower>upper) { + long a=lower; lower=upper; upper=a; + } + Query tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), true, true); + TopDocs topDocs = searcher.search(tq, null, noDocs, new Sort(NumericUtils.getLongSortField(field, true))); + if (topDocs.totalHits==0) continue; + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + long last=Long.parseLong(searcher.doc(sd[0].doc).get("value")); + for (int j=1; jact ); + last=act; + } + } + } + + public void testSorting_8bit() throws Exception { + testSorting(8); + } + + public void testSorting_4bit() throws Exception { + testSorting(4); + } + + public void testSorting_2bit() throws Exception { + testSorting(2); + } + + public void testEqualsAndHash() throws Exception { + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test1", 4, new Long(10L), new Long(20L), true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test2", 4, new Long(10L), new Long(20L), false, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test3", 4, new Long(10L), new Long(20L), true, false)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test4", 4, new Long(10L), new Long(20L), false, false)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test5", 4, new Long(10L), null, true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test6", 4, null, new Long(20L), true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test7", 4, null, null, true, true)); + QueryUtils.checkEqual( + NumericRangeQuery.newLongRange("test8", 4, new Long(10L), new Long(20L), true, true), + 
NumericRangeQuery.newLongRange("test8", 4, new Long(10L), new Long(20L), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newLongRange("test9", 4, new Long(10L), new Long(20L), true, true), + NumericRangeQuery.newLongRange("test9", 8, new Long(10L), new Long(20L), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newLongRange("test10a", 4, new Long(10L), new Long(20L), true, true), + NumericRangeQuery.newLongRange("test10b", 4, new Long(10L), new Long(20L), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newLongRange("test11", 4, new Long(10L), new Long(20L), true, true), + NumericRangeQuery.newLongRange("test11", 4, new Long(20L), new Long(10L), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newLongRange("test12", 4, new Long(10L), new Long(20L), true, true), + NumericRangeQuery.newLongRange("test12", 4, new Long(10L), new Long(20L), false, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newLongRange("test13", 4, new Long(10L), new Long(20L), true, true), + NumericRangeQuery.newFloatRange("test13", 4, new Float(10f), new Float(20f), true, true) + ); + // difference to int range is tested in TestNumericRangeQuery32 + } + +} diff --git a/src/test/org/apache/lucene/util/TestNumericUtils.java b/src/test/org/apache/lucene/util/TestNumericUtils.java new file mode 100644 index 00000000000..50867c364aa --- /dev/null +++ b/src/test/org/apache/lucene/util/TestNumericUtils.java @@ -0,0 +1,339 @@ +package org.apache.lucene.util; + +/** +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OpenBitSet; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; + +public class TestNumericUtils extends LuceneTestCase { + + public void testLongConversionAndOrdering() throws Exception { + // generate a series of encoded longs, each numerically one bigger than the one before + String last=null; + for (long l=-100000L; l<100000L; l++) { + String act=NumericUtils.longToPrefixCoded(l); + if (last!=null) { + // the previous encoded value must compare as smaller than the current one + assertTrue("actual bigger than last", last.compareTo(act) < 0 ); + } + // test if back and forward conversion works + assertEquals("forward and back conversion should generate same long", l, NumericUtils.prefixCodedToLong(act)); + // next step + last=act; + } + } + + public void testIntConversionAndOrdering() throws Exception { + // generate a series of encoded ints, each numerically one bigger than the one before + String last=null; + for (int i=-100000; i<100000; i++) { + String act=NumericUtils.intToPrefixCoded(i); + if (last!=null) { + // the previous encoded value must compare as smaller than the current one + assertTrue("actual bigger than last", last.compareTo(act) < 0 ); + } + // test if back and forward conversion works + assertEquals("forward and back conversion should generate same int", i, NumericUtils.prefixCodedToInt(act)); + // next step + last=act; + } + } + + public void testLongSpecialValues() throws Exception { + long[] vals=new long[]{ + Long.MIN_VALUE, Long.MIN_VALUE+1, Long.MIN_VALUE+2, -5003400000000L, + -4000L, -3000L, -2000L, -1000L, -1L, 0L, 1L, 10L, 
300L, 50006789999999999L, Long.MAX_VALUE-2, Long.MAX_VALUE-1, Long.MAX_VALUE + }; + String[] prefixVals=new String[vals.length]; + + /* NOTE(review): text appears to be missing from this point (looks like everything from a '<' up to a later '>' was dropped in extraction); the rest of this test and the header of assertLongRangeSplit are not visible here - restore from the upstream commit */ for (int i=0; i=lower && min<=upper && max>=lower && max<=upper); + if (useBitSet) for (long l=min; l<=max; l++) { + assertFalse("ranges should not overlap", bits.getAndSet(l-lower) ); + } + // flip the sign bit to make unsigned longs for easier display and understanding + min ^= 0x8000000000000000L; + max ^= 0x8000000000000000L; + //System.out.println("new Long(0x"+Long.toHexString(min>>>shift)+"L),new Long(0x"+Long.toHexString(max>>>shift)+"L),"); + assertEquals( "inner min bound", ((Long)neededBounds.next()).longValue(), min>>>shift); + assertEquals( "inner max bound", ((Long)neededBounds.next()).longValue(), max>>>shift); + } + }, precisionStep, lower, upper); + + if (useBitSet) { + // after flipping all bits in the range, the bit set should be empty (cardinality zero) + bits.flip(0,upper-lower+1); + assertTrue("The sub-range concenated should match the whole range", bits.isEmpty()); + } + } + + public void testSplitLongRange() throws Exception { + // a hard-coded "standard" range + assertLongRangeSplit(-5000L, 9500L, 4, true, Arrays.asList(new Long[]{ + new Long(0x7fffffffffffec78L),new Long(0x7fffffffffffec7fL), + new Long(0x8000000000002510L),new Long(0x800000000000251cL), + new Long(0x7fffffffffffec8L), new Long(0x7fffffffffffecfL), + new Long(0x800000000000250L), new Long(0x800000000000250L), + new Long(0x7fffffffffffedL), new Long(0x7fffffffffffefL), + new Long(0x80000000000020L), new Long(0x80000000000024L), + new Long(0x7ffffffffffffL), new Long(0x8000000000001L) + }).iterator()); + + // the same with no range splitting + assertLongRangeSplit(-5000L, 9500L, 64, true, Arrays.asList(new Long[]{ + new Long(0x7fffffffffffec78L),new Long(0x800000000000251cL) + }).iterator()); + + // this tests optimized range splitting: if one of the inner bounds + // is also the bound of the next lower precision, it should be used completely + assertLongRangeSplit(0L, 1024L+63L, 
4, true, Arrays.asList(new Long[]{ + new Long(0x800000000000040L), new Long(0x800000000000043L), + new Long(0x80000000000000L), new Long(0x80000000000003L) + }).iterator()); + + // the full long range should only consist of a lowest precision range; no bitset testing here, as too much memory needed :-) + assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 8, false, Arrays.asList(new Long[]{ + new Long(0x00L),new Long(0xffL) + }).iterator()); + + // the same with precisionStep=4 + assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 4, false, Arrays.asList(new Long[]{ + new Long(0x0L),new Long(0xfL) + }).iterator()); + + // the same with precisionStep=2 + assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 2, false, Arrays.asList(new Long[]{ + new Long(0x0L),new Long(0x3L) + }).iterator()); + + // the same with precisionStep=1 + assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 1, false, Arrays.asList(new Long[]{ + new Long(0x0L),new Long(0x1L) + }).iterator()); + + // an inverse range should produce no sub-ranges + assertLongRangeSplit(9500L, -5000L, 4, false, Collections.EMPTY_LIST.iterator()); + + // a 0-length range should reproduce the range itself + assertLongRangeSplit(9500L, 9500L, 4, false, Arrays.asList(new Long[]{ + new Long(0x800000000000251cL),new Long(0x800000000000251cL) + }).iterator()); + } + + /** Note: The neededBounds iterator must supply the expected bounds as unsigned values, i.e. with the sign bit flipped and already shifted (makes it easier to understand what's happening) */ + protected void assertIntRangeSplit(final int lower, final int upper, int precisionStep, + final boolean useBitSet, final Iterator neededBounds + ) throws Exception { + final OpenBitSet bits=useBitSet ? 
new OpenBitSet(upper-lower+1) : null; + + NumericUtils.splitIntRange(new NumericUtils.IntRangeBuilder() { + //@Override + public void addRange(int min, int max, int shift) { + assertTrue("min, max should be inside bounds", min>=lower && min<=upper && max>=lower && max<=upper); + if (useBitSet) for (int i=min; i<=max; i++) { + assertFalse("ranges should not overlap", bits.getAndSet(i-lower) ); + } + // flip the sign bit to make unsigned ints for easier display and understanding + min ^= 0x80000000; + max ^= 0x80000000; + //System.out.println("new Integer(0x"+Integer.toHexString(min>>>shift)+"),new Integer(0x"+Integer.toHexString(max>>>shift)+"),"); + assertEquals( "inner min bound", ((Integer)neededBounds.next()).intValue(), min>>>shift); + assertEquals( "inner max bound", ((Integer)neededBounds.next()).intValue(), max>>>shift); + } + }, precisionStep, lower, upper); + + if (useBitSet) { + // after flipping all bits in the range, the bit set should be empty (cardinality zero) + bits.flip(0,upper-lower+1); + assertTrue("The sub-range concenated should match the whole range", bits.isEmpty()); + } + } + + public void testSplitIntRange() throws Exception { + // a hard-coded "standard" range + assertIntRangeSplit(-5000, 9500, 4, true, Arrays.asList(new Integer[]{ + new Integer(0x7fffec78),new Integer(0x7fffec7f), + new Integer(0x80002510),new Integer(0x8000251c), + new Integer(0x7fffec8), new Integer(0x7fffecf), + new Integer(0x8000250), new Integer(0x8000250), + new Integer(0x7fffed), new Integer(0x7fffef), + new Integer(0x800020), new Integer(0x800024), + new Integer(0x7ffff), new Integer(0x80001) + }).iterator()); + + // the same with no range splitting + assertIntRangeSplit(-5000, 9500, 32, true, Arrays.asList(new Integer[]{ + new Integer(0x7fffec78),new Integer(0x8000251c) + }).iterator()); + + // this tests optimized range splitting: if one of the inner bounds + // is also the bound of the next lower precision, it should be used completely + assertIntRangeSplit(0, 1024+63, 4, true, Arrays.asList(new 
Integer[]{ + new Integer(0x8000040), new Integer(0x8000043), + new Integer(0x800000), new Integer(0x800003) + }).iterator()); + + // the full int range should only consist of a lowest precision range; no bitset testing here, as too much memory needed :-) + assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 8, false, Arrays.asList(new Integer[]{ + new Integer(0x00),new Integer(0xff) + }).iterator()); + + // the same with precisionStep=4 + assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 4, false, Arrays.asList(new Integer[]{ + new Integer(0x0),new Integer(0xf) + }).iterator()); + + // the same with precisionStep=2 + assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 2, false, Arrays.asList(new Integer[]{ + new Integer(0x0),new Integer(0x3) + }).iterator()); + + // the same with precisionStep=1 + assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 1, false, Arrays.asList(new Integer[]{ + new Integer(0x0),new Integer(0x1) + }).iterator()); + + // an inverse range should produce no sub-ranges + assertIntRangeSplit(9500, -5000, 4, false, Collections.EMPTY_LIST.iterator()); + + // a 0-length range should reproduce the range itself + assertIntRangeSplit(9500, 9500, 4, false, Arrays.asList(new Integer[]{ + new Integer(0x8000251c),new Integer(0x8000251c) + }).iterator()); + } + +}