From b2a4bc68adf2101970a83c8489fc5d936c6e9116 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Tue, 14 Jul 2009 09:17:44 +0000 Subject: [PATCH] LUCENE-1712: Set default precisionStep for NumericField and NumericRange* git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@793823 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 2 +- .../lucene/analysis/NumericTokenStream.java | 49 ++++++----- .../apache/lucene/document/NumericField.java | 38 +++++++- .../lucene/search/NumericRangeFilter.java | 61 +++++++++++++ .../lucene/search/NumericRangeQuery.java | 86 +++++++++++++++++-- .../org/apache/lucene/util/NumericUtils.java | 31 ++++--- .../analysis/TestNumericTokenStream.java | 26 +++--- .../search/TestNumericRangeQuery32.java | 18 +++- .../search/TestNumericRangeQuery64.java | 52 +++++++++-- 9 files changed, 296 insertions(+), 67 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index bf74cad4904..74a5c086ccf 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -516,7 +516,7 @@ New features See the Javadocs for NGramDistance.java for a reference paper on why this is helpful (Tom Morton via Grant Ingersoll) -27. LUCENE-1470, LUCENE-1582, LUCENE-1602, LUCENE-1673, LUCENE-1701: +27. LUCENE-1470, LUCENE-1582, LUCENE-1602, LUCENE-1673, LUCENE-1701, LUCENE-1712: Added NumericRangeQuery and NumericRangeFilter, a fast alternative to RangeQuery/RangeFilter for numeric searches. They depend on a specific structure of terms in the index that can be created by indexing diff --git a/src/java/org/apache/lucene/analysis/NumericTokenStream.java b/src/java/org/apache/lucene/analysis/NumericTokenStream.java index 1c0dcec984a..f011e240608 100644 --- a/src/java/org/apache/lucene/analysis/NumericTokenStream.java +++ b/src/java/org/apache/lucene/analysis/NumericTokenStream.java @@ -77,7 +77,10 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * *

Values indexed by this stream can be loaded into the {@link FieldCache} * and can be sorted (use {@link SortField}{@code .TYPE} to specify the correct - * type; {@link SortField#AUTO} does not work with this type of field) + * type; {@link SortField#AUTO} does not work with this type of field). + * Values solely used for sorting can be indexed using a precisionStep + * of {@link Integer#MAX_VALUE} (any value ≥64 has the same effect), because such a step produces + * only one token per value, at full precision. * *

NOTE: This API is experimental and * might change in incompatible ways in the next release. @@ -86,24 +89,30 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; */ public final class NumericTokenStream extends TokenStream { - /** The full precision 64 bit token gets this token type assigned. */ - public static final String TOKEN_TYPE_FULL_PREC_64 = "fullPrecNumeric64"; + /** The full precision token gets this token type assigned. */ + public static final String TOKEN_TYPE_FULL_PREC = "fullPrecNumeric"; - /** The lower precision 64 bit tokens gets this token type assigned. */ - public static final String TOKEN_TYPE_LOWER_PREC_64 = "lowerPrecNumeric64"; - - /** The full precision 32 bit token gets this token type assigned. */ - public static final String TOKEN_TYPE_FULL_PREC_32 = "fullPrecNumeric32"; - - /** The lower precision 32 bit tokens gets this token type assigned. */ - public static final String TOKEN_TYPE_LOWER_PREC_32 = "lowerPrecNumeric32"; + /** The lower precision tokens gets this token type assigned. */ + public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric"; /** - * Creates a token stream for numeric values. The stream is not yet initialized, + * Creates a token stream for numeric values using the default precisionStep + * {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The stream is not yet initialized, + * before using set a value using the various set???Value() methods. + */ + public NumericTokenStream() { + this(NumericUtils.PRECISION_STEP_DEFAULT); + } + + /** + * Creates a token stream for numeric values with the specified + * precisionStep. The stream is not yet initialized, * before using set a value using the various set???Value() methods. 
*/ public NumericTokenStream(final int precisionStep) { this.precisionStep = precisionStep; + if (precisionStep < 1) + throw new IllegalArgumentException("precisionStep must be >=1"); termAtt = (TermAttribute) addAttribute(TermAttribute.class); typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); @@ -165,8 +174,6 @@ public final class NumericTokenStream extends TokenStream { public void reset() { if (valSize == 0) throw new IllegalStateException("call set???Value() before usage"); - if (precisionStep < 1 || precisionStep > valSize) - throw new IllegalArgumentException("precisionStep may only be 1.."+valSize); shift = 0; } @@ -180,15 +187,13 @@ public final class NumericTokenStream extends TokenStream { final char[] buffer; switch (valSize) { case 64: - buffer = termAtt.resizeTermBuffer(NumericUtils.LONG_BUF_SIZE); + buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG); termAtt.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer)); - typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_64 : TOKEN_TYPE_LOWER_PREC_64); break; case 32: - buffer = termAtt.resizeTermBuffer(NumericUtils.INT_BUF_SIZE); + buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_INT); termAtt.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer)); - typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_32 : TOKEN_TYPE_LOWER_PREC_32); break; default: @@ -196,6 +201,7 @@ public final class NumericTokenStream extends TokenStream { throw new IllegalArgumentException("valSize must be 32 or 64"); } + typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC); posIncrAtt.setPositionIncrement((shift == 0) ? 
1 : 0); shift += precisionStep; return true; @@ -215,15 +221,13 @@ public final class NumericTokenStream extends TokenStream { final char[] buffer; switch (valSize) { case 64: - buffer = reusableToken.resizeTermBuffer(NumericUtils.LONG_BUF_SIZE); + buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG); reusableToken.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer)); - reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_64 : TOKEN_TYPE_LOWER_PREC_64); break; case 32: - buffer = reusableToken.resizeTermBuffer(NumericUtils.INT_BUF_SIZE); + buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_INT); reusableToken.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer)); - reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_32 : TOKEN_TYPE_LOWER_PREC_32); break; default: @@ -231,6 +235,7 @@ public final class NumericTokenStream extends TokenStream { throw new IllegalArgumentException("valSize must be 32 or 64"); } + reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC); reusableToken.setPositionIncrement((shift == 0) ? 1 : 0); shift += precisionStep; return reusableToken; diff --git a/src/java/org/apache/lucene/document/NumericField.java b/src/java/org/apache/lucene/document/NumericField.java index 1ae36c0c8a8..f22a1ba1eef 100644 --- a/src/java/org/apache/lucene/document/NumericField.java +++ b/src/java/org/apache/lucene/document/NumericField.java @@ -21,6 +21,7 @@ import java.io.Reader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.NumericTokenStream; +import org.apache.lucene.util.NumericUtils; import org.apache.lucene.search.NumericRangeQuery; // javadocs import org.apache.lucene.search.NumericRangeFilter; // javadocs import org.apache.lucene.search.SortField; // javadocs @@ -72,7 +73,10 @@ import org.apache.lucene.search.FieldCache; // javadocs * *

Values indexed by this field can be loaded into the {@link FieldCache} * and can be sorted (use {@link SortField}{@code .TYPE} to specify the correct - * type; {@link SortField#AUTO} does not work with this type of field) + * type; {@link SortField#AUTO} does not work with this type of field). + * Values solely used for sorting can be indexed using a precisionStep + * of {@link Integer#MAX_VALUE} (any value ≥64 has the same effect), because such a step produces + * only one token per value, at full precision. * *

NOTE: This API is experimental and * might change in incompatible ways in the next release. @@ -84,7 +88,34 @@ public final class NumericField extends AbstractField { private final NumericTokenStream tokenStream; /** - * Creates a field for numeric values. The instance is not yet initialized with + * Creates a field for numeric values using the default precisionStep + * {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The instance is not yet initialized with + * a numeric value, before indexing a document containing this field, + * set a value using the various set???Value() methods. + * This constrcutor creates an indexed, but not stored field. + * @param name the field name + */ + public NumericField(String name) { + this(name, NumericUtils.PRECISION_STEP_DEFAULT, Field.Store.NO, true); + } + + /** + * Creates a field for numeric values using the default precisionStep + * {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The instance is not yet initialized with + * a numeric value, before indexing a document containing this field, + * set a value using the various set???Value() methods. + * @param name the field name + * @param store if the field should be stored in plain text form + * (according to toString(value) of the used data type) + * @param index if the field should be indexed using {@link NumericTokenStream} + */ + public NumericField(String name, Field.Store store, boolean index) { + this(name, NumericUtils.PRECISION_STEP_DEFAULT, store, index); + } + + /** + * Creates a field for numeric values with the specified + * precisionStep. The instance is not yet initialized with * a numeric value, before indexing a document containing this field, * set a value using the various set???Value() methods. * This constrcutor creates an indexed, but not stored field. @@ -96,7 +127,8 @@ public final class NumericField extends AbstractField { } /** - * Creates a field for numeric values. 
The instance is not yet initialized with + * Creates a field for numeric values with the specified + * precisionStep. The instance is not yet initialized with * a numeric value, before indexing a document containing this field, * set a value using the various set???Value() methods. * @param name the field name diff --git a/src/java/org/apache/lucene/search/NumericRangeFilter.java b/src/java/org/apache/lucene/search/NumericRangeFilter.java index 09331a1f62c..f17e70a5ef3 100644 --- a/src/java/org/apache/lucene/search/NumericRangeFilter.java +++ b/src/java/org/apache/lucene/search/NumericRangeFilter.java @@ -19,6 +19,7 @@ package org.apache.lucene.search; import org.apache.lucene.analysis.NumericTokenStream; // for javadocs import org.apache.lucene.document.NumericField; // for javadocs +import org.apache.lucene.util.NumericUtils; // for javadocs /** * Implementation of a {@link Filter} that implements trie-based range filtering @@ -64,6 +65,21 @@ public final class NumericRangeFilter extends MultiTermQueryWrapperFilter { ); } + /** + * Factory that creates a NumericRangeFilter, that queries a long + * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeFilter newLongRange(final String field, + Long min, Long max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newLongRange(field, min, max, minInclusive, maxInclusive) + ); + } + /** * Factory that creates a NumericRangeFilter, that filters a int * range using the given precisionStep. 
@@ -79,6 +95,21 @@ public final class NumericRangeFilter extends MultiTermQueryWrapperFilter { ); } + /** + * Factory that creates a NumericRangeFilter, that queries a int + * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeFilter newIntRange(final String field, + Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newIntRange(field, min, max, minInclusive, maxInclusive) + ); + } + /** * Factory that creates a NumericRangeFilter, that filters a double * range using the given precisionStep. @@ -94,6 +125,21 @@ public final class NumericRangeFilter extends MultiTermQueryWrapperFilter { ); } + /** + * Factory that creates a NumericRangeFilter, that queries a double + * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeFilter newDoubleRange(final String field, + Double min, Double max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newDoubleRange(field, min, max, minInclusive, maxInclusive) + ); + } + /** * Factory that creates a NumericRangeFilter, that filters a float * range using the given precisionStep. 
@@ -109,6 +155,21 @@ public final class NumericRangeFilter extends MultiTermQueryWrapperFilter { ); } + /** + * Factory that creates a NumericRangeFilter, that queries a float + * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeFilter newFloatRange(final String field, + Float min, Float max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newFloatRange(field, min, max, minInclusive, maxInclusive) + ); + } + /** Returns the field name for this filter */ public String getField() { return ((NumericRangeQuery)query).getField(); } diff --git a/src/java/org/apache/lucene/search/NumericRangeQuery.java b/src/java/org/apache/lucene/search/NumericRangeQuery.java index dfdc32f1572..6c680e29a4f 100644 --- a/src/java/org/apache/lucene/search/NumericRangeQuery.java +++ b/src/java/org/apache/lucene/search/NumericRangeQuery.java @@ -38,8 +38,9 @@ import org.apache.lucene.index.Term; * An important setting is the precisionStep, which specifies, * how many different precisions per numeric value are indexed to speed up range queries. * Lower values create more terms but speed up search, higher values create less terms, but - * slow down search. Suitable values are 2, 4, or 8. A good starting point to test is 4. - * For code examples see {@link NumericField}. + * slow down search. Suitable values are between 1 and 8. A good starting point to test is 4, + * which is the default value for all Numeric* classes. For a discussion about ideal + * values, see below. Indexing code examples can be found in {@link NumericField}. * *

Searching

*

This class has no constructor, you can create queries depending on the data type @@ -51,6 +52,8 @@ import org.apache.lucene.index.Term; * new Float(0.3f), new Float(0.10f), * true, true); * + * The used precisionStep must be compatible + * to the one used during indexing (see below). The default is also 4. * *

How it works

* @@ -101,18 +104,31 @@ import org.apache.lucene.index.Term; * be found out by testing. Important: You can index with a lower precision step value and test search speed * using a multiple of the original step value.

* + *

Good values for precisionStep depend on usage and data type: + *

+ * *

This dramatically improves the performance of Apache Lucene with range queries, which * are no longer dependent on the index size and the number of distinct values because there is * an upper limit unrelated to either of these properties.

* *

Comparisions of the different types of RangeQueries on an index with about 500,000 docs showed - * that the old {@link RangeQuery} (with raised {@link BooleanQuery} clause count) took about 30-40 - * secs to complete, {@link ConstantScoreRangeQuery} took 5 secs and executing - * this class took <100ms to complete (on an Opteron64 machine, Java 1.5, 8 bit precision step). - * This query type was developed for a geographic portal, where the performance for + * that {@link TermRangeQuery} in boolean rewrite mode (with raised {@link BooleanQuery} clause count) + * took about 30-40 secs to complete, {@link TermRangeQuery} in constant score rewrite mode took 5 secs + * and executing this class took <100ms to complete (on an Opteron64 machine, Java 1.5, 8 bit + * precision step). This query type was developed for a geographic portal, where the performance for * e.g. bounding boxes or exact date/time stamps is important.

* - *

The query is in {@linkplain #setConstantScoreRewrite constant score mode} per default. + *

The query defaults to {@linkplain #setConstantScoreRewrite constant score rewrite mode}. * With precision steps of ≤4, this query can be run in conventional {@link BooleanQuery} * rewrite mode without changing the max clause count. * @@ -127,8 +143,8 @@ public final class NumericRangeQuery extends MultiTermQuery { Number min, Number max, final boolean minInclusive, final boolean maxInclusive ) { assert (valSize == 32 || valSize == 64); - if (precisionStep < 1 || precisionStep > valSize) - throw new IllegalArgumentException("precisionStep may only be 1.."+valSize); + if (precisionStep < 1) + throw new IllegalArgumentException("precisionStep must be >=1"); this.field = field.intern(); this.precisionStep = precisionStep; this.valSize = valSize; @@ -152,6 +168,19 @@ public final class NumericRangeQuery extends MultiTermQuery { return new NumericRangeQuery(field, precisionStep, 64, min, max, minInclusive, maxInclusive); } + /** + * Factory that creates a NumericRangeQuery, that queries a long + * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeQuery newLongRange(final String field, + Long min, Long max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 64, min, max, minInclusive, maxInclusive); + } + /** * Factory that creates a NumericRangeQuery, that queries a int * range using the given precisionStep. 
@@ -165,6 +194,19 @@ public final class NumericRangeQuery extends MultiTermQuery { return new NumericRangeQuery(field, precisionStep, 32, min, max, minInclusive, maxInclusive); } + /** + * Factory that creates a NumericRangeQuery, that queries a int + * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeQuery newIntRange(final String field, + Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 32, min, max, minInclusive, maxInclusive); + } + /** * Factory that creates a NumericRangeQuery, that queries a double * range using the given precisionStep. @@ -178,6 +220,19 @@ public final class NumericRangeQuery extends MultiTermQuery { return new NumericRangeQuery(field, precisionStep, 64, min, max, minInclusive, maxInclusive); } + /** + * Factory that creates a NumericRangeQuery, that queries a double + * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeQuery newDoubleRange(final String field, + Double min, Double max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 64, min, max, minInclusive, maxInclusive); + } + /** * Factory that creates a NumericRangeQuery, that queries a float * range using the given precisionStep. 
@@ -191,6 +246,19 @@ public final class NumericRangeQuery extends MultiTermQuery { return new NumericRangeQuery(field, precisionStep, 32, min, max, minInclusive, maxInclusive); } + /** + * Factory that creates a NumericRangeQuery, that queries a float + * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). + * You can have half-open ranges (which are in fact </≤ or >/≥ queries) + * by setting the min or max value to null. By setting inclusive to false, it will + * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. + */ + public static NumericRangeQuery newFloatRange(final String field, + Float min, Float max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 32, min, max, minInclusive, maxInclusive); + } + //@Override protected FilteredTermEnum getEnum(final IndexReader reader) throws IOException { return new NumericRangeTermEnum(reader); diff --git a/src/java/org/apache/lucene/util/NumericUtils.java b/src/java/org/apache/lucene/util/NumericUtils.java index ea4252a8b96..1a3c635632a 100644 --- a/src/java/org/apache/lucene/util/NumericUtils.java +++ b/src/java/org/apache/lucene/util/NumericUtils.java @@ -18,6 +18,7 @@ package org.apache.lucene.util; */ import org.apache.lucene.analysis.NumericTokenStream; // for javadocs +import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.search.NumericRangeQuery; // for javadocs import org.apache.lucene.search.NumericRangeFilter; // for javadocs @@ -62,9 +63,15 @@ import org.apache.lucene.search.NumericRangeFilter; // for javadocs public final class NumericUtils { private NumericUtils() {} // no instance! - + /** - * Longs are stored at lower precision by shifting off lower bits. 
The shift count is + * The default precision step used by {@link NumericField}, {@link NumericTokenStream}, + * {@link NumericRangeQuery}, and {@link NumericRangeFilter} as default + */ + public static final int PRECISION_STEP_DEFAULT = 4; + + /** + * Expert: Longs are stored at lower precision by shifting off lower bits. The shift count is * stored as SHIFT_START_LONG+shift in the first character */ public static final char SHIFT_START_LONG = (char)0x20; @@ -74,10 +81,10 @@ public final class NumericUtils { * for encoding long values. * @see #longToPrefixCoded(long,int,char[]) */ - public static final int LONG_BUF_SIZE = 63/7 + 2; + public static final int BUF_SIZE_LONG = 63/7 + 2; /** - * Integers are stored at lower precision by shifting off lower bits. The shift count is + * Expert: Integers are stored at lower precision by shifting off lower bits. The shift count is * stored as SHIFT_START_INT+shift in the first character */ public static final char SHIFT_START_INT = (char)0x60; @@ -87,14 +94,14 @@ public final class NumericUtils { * for encoding int values. * @see #intToPrefixCoded(int,int,char[]) */ - public static final int INT_BUF_SIZE = 31/7 + 2; + public static final int BUF_SIZE_INT = 31/7 + 2; /** * Expert: Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link NumericTokenStream}. 
* @param val the numeric value * @param shift how many bits to strip from the right - * @param buffer that will contain the encoded chars, must be at least of {@link #LONG_BUF_SIZE} + * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_LONG} * length * @return number of chars written to buffer */ @@ -122,7 +129,7 @@ public final class NumericUtils { * @param shift how many bits to strip from the right */ public static String longToPrefixCoded(final long val, final int shift) { - final char[] buffer = new char[LONG_BUF_SIZE]; + final char[] buffer = new char[BUF_SIZE_LONG]; final int len = longToPrefixCoded(val, shift, buffer); return new String(buffer, 0, len); } @@ -142,7 +149,7 @@ public final class NumericUtils { * This is method is used by {@link NumericTokenStream}. * @param val the numeric value * @param shift how many bits to strip from the right - * @param buffer that will contain the encoded chars, must be at least of {@link #INT_BUF_SIZE} + * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_INT} * length * @return number of chars written to buffer */ @@ -170,7 +177,7 @@ public final class NumericUtils { * @param shift how many bits to strip from the right */ public static String intToPrefixCoded(final int val, final int shift) { - final char[] buffer = new char[INT_BUF_SIZE]; + final char[] buffer = new char[BUF_SIZE_INT]; final int len = intToPrefixCoded(val, shift, buffer); return new String(buffer, 0, len); } @@ -294,8 +301,6 @@ public final class NumericUtils { public static void splitLongRange(final LongRangeBuilder builder, final int precisionStep, final long minBound, final long maxBound ) { - if (precisionStep<1 || precisionStep>64) - throw new IllegalArgumentException("precisionStep may only be 1..64"); splitRange(builder, 64, precisionStep, minBound, maxBound); } @@ -310,8 +315,6 @@ public final class NumericUtils { public static void splitIntRange(final IntRangeBuilder 
builder, final int precisionStep, final int minBound, final int maxBound ) { - if (precisionStep<1 || precisionStep>32) - throw new IllegalArgumentException("precisionStep may only be 1..32"); splitRange(builder, 32, precisionStep, (long)minBound, (long)maxBound); } @@ -320,6 +323,8 @@ public final class NumericUtils { final Object builder, final int valSize, final int precisionStep, long minBound, long maxBound ) { + if (precisionStep < 1) + throw new IllegalArgumentException("precisionStep must be >=1"); if (minBound > maxBound) return; for (int shift=0; ; shift += precisionStep) { // calculate new bounds for inner precision diff --git a/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java b/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java index 5f225895a13..e9f73019512 100644 --- a/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java +++ b/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java @@ -20,61 +20,67 @@ package org.apache.lucene.analysis; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; public class TestNumericTokenStream extends LuceneTestCase { - static final int precisionStep = 8; static final long lvalue = 4573245871874382L; static final int ivalue = 123456; public void testLongStreamNewAPI() throws Exception { - final NumericTokenStream stream=new NumericTokenStream(precisionStep).setLongValue(lvalue); + final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue); stream.setUseNewAPI(true); // use getAttribute to test if attributes really exist, if not an IAE will be throwed final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); - for (int shift=0; shift<64; shift+=precisionStep) { + final TypeAttribute typeAtt = (TypeAttribute) stream.getAttribute(TypeAttribute.class); + for (int 
shift=0; shift<64; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { assertTrue("New token is available", stream.incrementToken()); assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), termAtt.term()); + assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type()); } assertFalse("No more tokens available", stream.incrementToken()); } public void testLongStreamOldAPI() throws Exception { - final NumericTokenStream stream=new NumericTokenStream(precisionStep).setLongValue(lvalue); + final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue); stream.setUseNewAPI(false); Token tok=new Token(); - for (int shift=0; shift<64; shift+=precisionStep) { + for (int shift=0; shift<64; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { assertNotNull("New token is available", tok=stream.next(tok)); assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), tok.term()); + assertEquals("Type correct", (shift == 0) ? 
NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, tok.type()); } assertNull("No more tokens available", stream.next(tok)); } public void testIntStreamNewAPI() throws Exception { - final NumericTokenStream stream=new NumericTokenStream(precisionStep).setIntValue(ivalue); + final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue); stream.setUseNewAPI(true); // use getAttribute to test if attributes really exist, if not an IAE will be throwed final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); - for (int shift=0; shift<32; shift+=precisionStep) { + final TypeAttribute typeAtt = (TypeAttribute) stream.getAttribute(TypeAttribute.class); + for (int shift=0; shift<32; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { assertTrue("New token is available", stream.incrementToken()); assertEquals("Term is correctly encoded", NumericUtils.intToPrefixCoded(ivalue, shift), termAtt.term()); + assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type()); } assertFalse("No more tokens available", stream.incrementToken()); } public void testIntStreamOldAPI() throws Exception { - final NumericTokenStream stream=new NumericTokenStream(precisionStep).setIntValue(ivalue); + final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue); stream.setUseNewAPI(false); Token tok=new Token(); - for (int shift=0; shift<32; shift+=precisionStep) { + for (int shift=0; shift<32; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { assertNotNull("New token is available", tok=stream.next(tok)); assertEquals("Term is correctly encoded", NumericUtils.intToPrefixCoded(ivalue, shift), tok.term()); + assertEquals("Type correct", (shift == 0) ? 
NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, tok.type()); } assertNull("No more tokens available", stream.next(tok)); } public void testNotInitialized() throws Exception { - final NumericTokenStream stream=new NumericTokenStream(precisionStep); + final NumericTokenStream stream=new NumericTokenStream(); try { stream.reset(); diff --git a/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java b/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java index 5c928dc3b30..30316858195 100644 --- a/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java +++ b/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java @@ -53,13 +53,14 @@ public class TestNumericRangeQuery32 extends LuceneTestCase { field8 = new NumericField("field8", 8, Field.Store.YES, true), field4 = new NumericField("field4", 4, Field.Store.YES, true), field2 = new NumericField("field2", 2, Field.Store.YES, true), + fieldNoTrie = new NumericField("field"+Integer.MAX_VALUE, Integer.MAX_VALUE, Field.Store.YES, true), ascfield8 = new NumericField("ascfield8", 8, Field.Store.NO, true), ascfield4 = new NumericField("ascfield4", 4, Field.Store.NO, true), ascfield2 = new NumericField("ascfield2", 2, Field.Store.NO, true); Document doc = new Document(); // add fields, that have a distance to test general functionality - doc.add(field8); doc.add(field4); doc.add(field2); + doc.add(field8); doc.add(field4); doc.add(field2); doc.add(fieldNoTrie); // add ascending fields with a distance of 1, beginning at -noDocs/2 to test the correct splitting of range and inclusive/exclusive doc.add(ascfield8); doc.add(ascfield4); doc.add(ascfield2); @@ -69,6 +70,7 @@ public class TestNumericRangeQuery32 extends LuceneTestCase { field8.setIntValue(val); field4.setIntValue(val); field2.setIntValue(val); + fieldNoTrie.setIntValue(val); val=l-(noDocs/2); ascfield8.setIntValue(val); @@ -261,9 +263,13 @@ public class TestNumericRangeQuery32 extends LuceneTestCase { 
termCountT += tq.getTotalNumberOfTerms(); termCountC += cq.getTotalNumberOfTerms(); } - System.out.println("Average number of terms during random search on '" + field + "':"); - System.out.println(" Trie query: " + (((double)termCountT)/(50*4))); - System.out.println(" Classical query: " + (((double)termCountC)/(50*4))); + if (precisionStep == Integer.MAX_VALUE) { + assertEquals("Total number of terms should be equal for unlimited precStep", termCountT, termCountC); + } else { + System.out.println("Average number of terms during random search on '" + field + "':"); + System.out.println(" Trie query: " + (((double)termCountT)/(50*4))); + System.out.println(" Classical query: " + (((double)termCountC)/(50*4))); + } } public void testRandomTrieAndClassicRangeQuery_8bit() throws Exception { @@ -278,6 +284,10 @@ public class TestNumericRangeQuery32 extends LuceneTestCase { testRandomTrieAndClassicRangeQuery(2); } + public void testRandomTrieAndClassicRangeQuery_NoTrie() throws Exception { + testRandomTrieAndClassicRangeQuery(Integer.MAX_VALUE); + } + private void testRangeSplit(int precisionStep) throws Exception { final Random rnd=newRandom(); String field="ascfield"+precisionStep; diff --git a/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java b/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java index bec0a35ac1b..9993827e5be 100644 --- a/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java +++ b/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java @@ -51,27 +51,33 @@ public class TestNumericRangeQuery64 extends LuceneTestCase { NumericField field8 = new NumericField("field8", 8, Field.Store.YES, true), + field6 = new NumericField("field6", 6, Field.Store.YES, true), field4 = new NumericField("field4", 4, Field.Store.YES, true), field2 = new NumericField("field2", 2, Field.Store.YES, true), + fieldNoTrie = new NumericField("field"+Integer.MAX_VALUE, Integer.MAX_VALUE, Field.Store.YES, true), ascfield8 = new 
NumericField("ascfield8", 8, Field.Store.NO, true), + ascfield6 = new NumericField("ascfield6", 6, Field.Store.NO, true), ascfield4 = new NumericField("ascfield4", 4, Field.Store.NO, true), ascfield2 = new NumericField("ascfield2", 2, Field.Store.NO, true); Document doc = new Document(); // add fields, that have a distance to test general functionality - doc.add(field8); doc.add(field4); doc.add(field2); + doc.add(field8); doc.add(field6); doc.add(field4); doc.add(field2); doc.add(fieldNoTrie); // add ascending fields with a distance of 1, beginning at -noDocs/2 to test the correct splitting of range and inclusive/exclusive - doc.add(ascfield8); doc.add(ascfield4); doc.add(ascfield2); + doc.add(ascfield8); doc.add(ascfield6); doc.add(ascfield4); doc.add(ascfield2); // Add a series of noDocs docs with increasing long values, by updating the fields for (int l=0; l