LUCENE-1470: New implementation using encoding of TrieUtils and TrieRangeFilter that now also supports 32bit and 64bit fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@744207 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2009-02-13 18:27:01 +00:00
parent 18b819abe0
commit 8b79fa51a5
11 changed files with 1800 additions and 1137 deletions

View File

@ -0,0 +1,160 @@
package org.apache.lucene.search.trie;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.ToStringUtils;
abstract class AbstractTrieRangeFilter extends Filter {
AbstractTrieRangeFilter(final String[] fields, final int precisionStep,
Number min, Number max, final boolean minInclusive, final boolean maxInclusive
) {
this.fields=(String[])fields.clone();
this.precisionStep=precisionStep;
this.min=min;
this.max=max;
this.minInclusive=minInclusive;
this.maxInclusive=maxInclusive;
}
//@Override
public String toString() {
return toString(null);
}
public String toString(final String field) {
final StringBuffer sb=new StringBuffer();
if (!this.fields[0].equals(field)) sb.append(this.fields[0]).append(':');
return sb.append(minInclusive ? '[' : '{')
.append((min==null) ? "*" : min.toString())
.append(" TO ")
.append((max==null) ? "*" : max.toString())
.append(maxInclusive ? ']' : '}').toString();
}
//@Override
public final boolean equals(final Object o) {
if (o==this) return true;
if (o==null) return false;
if (this.getClass().equals(o.getClass())) {
AbstractTrieRangeFilter q=(AbstractTrieRangeFilter)o;
return (
Arrays.equals(fields,q.fields) &&
(q.min == null ? min == null : q.min.equals(min)) &&
(q.max == null ? max == null : q.max.equals(max)) &&
minInclusive==q.minInclusive &&
maxInclusive==q.maxInclusive &&
precisionStep==q.precisionStep
);
}
return false;
}
//@Override
public final int hashCode() {
int hash=Arrays.asList(fields).hashCode()+(precisionStep^0x64365465);
if (min!=null) hash += min.hashCode()^0x14fa55fb;
if (max!=null) hash += max.hashCode()^0x733fa5fe;
return hash+
(Boolean.valueOf(minInclusive).hashCode()^0x14fa55fb)+
(Boolean.valueOf(maxInclusive).hashCode()^0x733fa5fe);
}
/**
* Expert: Return the number of terms visited during the last execution of {@link #getDocIdSet}.
* This may be used for performance comparisons of different trie variants and their effectiveness.
* This method is not thread safe, be sure to only call it when no query is running!
* @throws IllegalStateException if {@link #getDocIdSet} was not yet executed.
*/
public int getLastNumberOfTerms() {
if (lastNumberOfTerms < 0) throw new IllegalStateException();
return lastNumberOfTerms;
}
void resetLastNumberOfTerms() {
lastNumberOfTerms=0;
}
/** Returns this range filter as a query.
* Using this method, it is possible to create a Query using <code>new {Long|Int}TrieRangeFilter(....).asQuery()</code>.
* This is a synonym for wrapping with a {@link ConstantScoreQuery},
* but this query returns a better <code>toString()</code> variant.
*/
public Query asQuery() {
return new ConstantScoreQuery(this) {
/** this instance return a nicer String variant than the original {@link ConstantScoreQuery} */
//@Override
public String toString(final String field) {
// return a more convenient representation of this query than ConstantScoreQuery does:
return ((AbstractTrieRangeFilter) filter).toString(field)+ToStringUtils.boost(getBoost());
}
};
}
void fillBits(
final IndexReader reader,
final OpenBitSet bits, final TermDocs termDocs,
String field,
final String lowerTerm, final String upperTerm
) throws IOException {
final int len=lowerTerm.length();
assert upperTerm.length()==len;
field=field.intern();
// find the docs
final TermEnum enumerator = reader.terms(new Term(field, lowerTerm));
try {
do {
final Term term = enumerator.term();
if (term!=null && term.field()==field) {
// break out when upperTerm reached or length of term is different
final String t=term.text();
if (len!=t.length() || t.compareTo(upperTerm)>0) break;
// we have a good term, find the docs
lastNumberOfTerms++;
termDocs.seek(enumerator);
while (termDocs.next()) bits.set(termDocs.doc());
} else break;
} while (enumerator.next());
} finally {
enumerator.close();
}
}
// members
final String[] fields;
final int precisionStep;
final Number min,max;
final boolean minInclusive,maxInclusive;
private int lastNumberOfTerms=-1;
}

View File

@ -0,0 +1,140 @@
package org.apache.lucene.search.trie;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.util.OpenBitSet;
/**
* Implementation of a Lucene {@link Filter} that implements trie-based range filtering for ints/floats.
* This filter depends on a specific structure of terms in the index that can only be created
* by {@link TrieUtils} methods.
* For more information, how the algorithm works, see the {@linkplain org.apache.lucene.search.trie package description}.
*/
public class IntTrieRangeFilter extends AbstractTrieRangeFilter {
/**
* A trie filter for matching trie coded values using the given field name and
* the default helper field.
* <code>precisionStep</code> must me equal or a multiple of the <code>precisionStep</code>
* used for indexing the values.
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
* To query float values use the converter {@link TrieUtils#floatToSortableInt}.
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String[])}.
* <p><b>This is the recommended usage of TrieUtils/IntTrieRangeFilter.</b>
*/
public IntTrieRangeFilter(final String field, final int precisionStep,
final Integer min, final Integer max, final boolean minInclusive, final boolean maxInclusive
) {
this(
new String[]{field, field+TrieUtils.LOWER_PRECISION_FIELD_NAME_SUFFIX},
precisionStep,min,max,minInclusive,maxInclusive
);
}
/**
* Expert: A trie filter for matching trie coded values using the given field names.
* You can specify the main and helper field name, that was used to idex the values.
* <code>precisionStep</code> must me equal or a multiple of the <code>precisionStep</code>
* used for indexing the values.
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
* To query float values use the converter {@link TrieUtils#floatToSortableInt}.
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String,String[])}.
*/
public IntTrieRangeFilter(final String field, final String lowerPrecisionField, final int precisionStep,
final Integer min, final Integer max, final boolean minInclusive, final boolean maxInclusive
) {
this(new String[]{field, lowerPrecisionField},precisionStep,min,max,minInclusive,maxInclusive);
}
/**
* Expert: A trie filter for matching trie coded values
* using the given field names. If the array of field names is shorter than the
* trieCoded one, all trieCoded values with higher index get the last field name.
* <code>precisionStep</code> must me equal or a multiple of the <code>precisionStep</code>
* used for indexing the values.
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
* To query float values use the converter {@link TrieUtils#floatToSortableInt}.
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String[],String[])}.
*/
public IntTrieRangeFilter(final String[] fields, final int precisionStep,
Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive
) {
super(fields, precisionStep, min, max, minInclusive, maxInclusive);
}
/**
* Returns a DocIdSet that provides the documents which should be permitted or prohibited in search results.
*/
//@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
// calculate the upper and lower bounds respecting the inclusive and null values.
int minBound=(this.min==null) ? Integer.MIN_VALUE : (
minInclusive ? this.min.intValue() : (this.min.intValue()+1)
);
int maxBound=(this.max==null) ? Integer.MAX_VALUE : (
maxInclusive ? this.max.intValue() : (this.max.intValue()-1)
);
resetLastNumberOfTerms();
if (minBound > maxBound) {
// shortcut, no docs will match this
return DocIdSet.EMPTY_DOCIDSET;
} else {
final OpenBitSet bits = new OpenBitSet(reader.maxDoc());
final TermDocs termDocs = reader.termDocs();
try {
TrieUtils.splitIntRange(new TrieUtils.IntRangeBuilder() {
//@Override
public final void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
try {
fillBits(
reader, bits, termDocs,
fields[Math.min(fields.length-1, level)],
minPrefixCoded, maxPrefixCoded
);
} catch (IOException ioe) {
// IntRangeBuilder is not allowed to throw checked exceptions:
// wrap as RuntimeException
throw new RuntimeException(ioe);
}
}
}, precisionStep, minBound, maxBound);
} catch (RuntimeException e) {
if (e.getCause() instanceof IOException) throw (IOException)e.getCause();
throw e;
} finally {
termDocs.close();
}
return bits;
}
}
}

View File

@ -0,0 +1,140 @@
package org.apache.lucene.search.trie;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.util.OpenBitSet;
/**
* Implementation of a Lucene {@link Filter} that implements trie-based range filtering for longs/doubles.
* This filter depends on a specific structure of terms in the index that can only be created
* by {@link TrieUtils} methods.
* For more information, how the algorithm works, see the {@linkplain org.apache.lucene.search.trie package description}.
*/
public class LongTrieRangeFilter extends AbstractTrieRangeFilter {
/**
* A trie filter for matching trie coded values using the given field name and
* the default helper field.
* <code>precisionStep</code> must me equal or a multiple of the <code>precisionStep</code>
* used for indexing the values.
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
* To query double values use the converter {@link TrieUtils#doubleToSortableLong}.
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String[])}.
* <p><b>This is the recommended usage of TrieUtils/LongTrieRangeFilter.</b>
*/
public LongTrieRangeFilter(final String field, final int precisionStep,
final Long min, final Long max, final boolean minInclusive, final boolean maxInclusive
) {
this(
new String[]{field, field+TrieUtils.LOWER_PRECISION_FIELD_NAME_SUFFIX},
precisionStep,min,max,minInclusive,maxInclusive
);
}
/**
* Expert: A trie filter for matching trie coded values using the given field names.
* You can specify the main and helper field name, that was used to idex the values.
* <code>precisionStep</code> must me equal or a multiple of the <code>precisionStep</code>
* used for indexing the values.
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
* To query double values use the converter {@link TrieUtils#doubleToSortableLong}.
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String,String[])}.
*/
public LongTrieRangeFilter(final String field, final String lowerPrecisionField, final int precisionStep,
final Long min, final Long max, final boolean minInclusive, final boolean maxInclusive
) {
this(new String[]{field, lowerPrecisionField},precisionStep,min,max,minInclusive,maxInclusive);
}
/**
* Expert: A trie filter for matching trie coded values
* using the given field names. If the array of field names is shorter than the
* trieCoded one, all trieCoded values with higher index get the last field name.
* <code>precisionStep</code> must me equal or a multiple of the <code>precisionStep</code>
* used for indexing the values.
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
* To query double values use the converter {@link TrieUtils#doubleToSortableLong}.
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String[],String[])}.
*/
public LongTrieRangeFilter(final String[] fields, final int precisionStep,
Long min, Long max, final boolean minInclusive, final boolean maxInclusive
) {
super(fields, precisionStep, min, max, minInclusive, maxInclusive);
}
/**
* Returns a DocIdSet that provides the documents which should be permitted or prohibited in search results.
*/
//@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
// calculate the upper and lower bounds respecting the inclusive and null values.
long minBound=(this.min==null) ? Long.MIN_VALUE : (
minInclusive ? this.min.longValue() : (this.min.longValue()+1L)
);
long maxBound=(this.max==null) ? Long.MAX_VALUE : (
maxInclusive ? this.max.longValue() : (this.max.longValue()-1L)
);
resetLastNumberOfTerms();
if (minBound > maxBound) {
// shortcut, no docs will match this
return DocIdSet.EMPTY_DOCIDSET;
} else {
final OpenBitSet bits = new OpenBitSet(reader.maxDoc());
final TermDocs termDocs = reader.termDocs();
try {
TrieUtils.splitLongRange(new TrieUtils.LongRangeBuilder() {
//@Override
public final void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
try {
fillBits(
reader, bits, termDocs,
fields[Math.min(fields.length-1, level)],
minPrefixCoded, maxPrefixCoded
);
} catch (IOException ioe) {
// LongRangeBuilder is not allowed to throw checked exceptions:
// wrap as RuntimeException
throw new RuntimeException(ioe);
}
}
}, precisionStep, minBound, maxBound);
} catch (RuntimeException e) {
if (e.getCause() instanceof IOException) throw (IOException)e.getCause();
throw e;
} finally {
termDocs.close();
}
return bits;
}
}
}

View File

@ -1,302 +0,0 @@
package org.apache.lucene.search.trie;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.OpenBitSet;
/**
* Implementation of a Lucene {@link Filter} that implements trie-based range filtering.
* This filter depends on a specific structure of terms in the index that can only be created
* by {@link TrieUtils} methods.
* For more information, how the algorithm works, see the package description {@link org.apache.lucene.search.trie}.
*/
public final class TrieRangeFilter extends Filter {
/**
* Universal constructor (expert use only): Uses already trie-converted min/max values.
* You can set <code>min</code> or <code>max</code> (but not both) to <code>null</code> to leave one bound open.
* With <code>minInclusive</code> and <code>maxInclusive</code> can be choosen, if the corresponding
* bound should be included or excluded from the range.
*/
public TrieRangeFilter(final String field, String min, String max,
final boolean minInclusive, final boolean maxInclusive, final TrieUtils variant
) {
if (min==null && max==null) throw new IllegalArgumentException("The min and max values cannot be both null.");
this.trieVariant=variant;
this.field=field.intern();
// just for toString()
this.minUnconverted=min;
this.maxUnconverted=max;
this.minInclusive=minInclusive;
this.maxInclusive=maxInclusive;
// encode bounds
this.min=(min==null) ? trieVariant.TRIE_CODED_NUMERIC_MIN : (
minInclusive ? min : variant.incrementTrieCoded(min)
);
this.max=(max==null) ? trieVariant.TRIE_CODED_NUMERIC_MAX : (
maxInclusive ? max : variant.decrementTrieCoded(max)
);
// check encoded values
if (
this.min.length() != trieVariant.TRIE_CODED_LENGTH ||
this.max.length() != trieVariant.TRIE_CODED_LENGTH
) throw new NumberFormatException("Invalid trie encoded numerical value representation (incompatible length).");
}
/**
* Generates a trie filter using the supplied field with range bounds in numeric form (double).
* You can set <code>min</code> or <code>max</code> (but not both) to <code>null</code> to leave one bound open.
* With <code>minInclusive</code> and <code>maxInclusive</code> can be choosen, if the corresponding
* bound should be included or excluded from the range.
*/
public TrieRangeFilter(final String field, final Double min, final Double max,
final boolean minInclusive, final boolean maxInclusive, final TrieUtils variant
) {
this(
field,
(min==null) ? null : variant.doubleToTrieCoded(min.doubleValue()),
(max==null) ? null : variant.doubleToTrieCoded(max.doubleValue()),
minInclusive,
maxInclusive,
variant
);
this.minUnconverted=min;
this.maxUnconverted=max;
}
/**
* Generates a trie filter using the supplied field with range bounds in date/time form.
* You can set <code>min</code> or <code>max</code> (but not both) to <code>null</code> to leave one bound open.
* With <code>minInclusive</code> and <code>maxInclusive</code> can be choosen, if the corresponding
* bound should be included or excluded from the range.
*/
public TrieRangeFilter(final String field, final Date min, final Date max,
final boolean minInclusive, final boolean maxInclusive, final TrieUtils variant
) {
this(
field,
(min==null) ? null : variant.dateToTrieCoded(min),
(max==null) ? null : variant.dateToTrieCoded(max),
minInclusive,
maxInclusive,
variant
);
this.minUnconverted=min;
this.maxUnconverted=max;
}
/**
* Generates a trie filter using the supplied field with range bounds in integer form (long).
* You can set <code>min</code> or <code>max</code> (but not both) to <code>null</code> to leave one bound open.
* With <code>minInclusive</code> and <code>maxInclusive</code> can be choosen, if the corresponding
* bound should be included or excluded from the range.
*/
public TrieRangeFilter(final String field, final Long min, final Long max,
final boolean minInclusive, final boolean maxInclusive, final TrieUtils variant
) {
this(
field,
(min==null) ? null : variant.longToTrieCoded(min.longValue()),
(max==null) ? null : variant.longToTrieCoded(max.longValue()),
minInclusive,
maxInclusive,
variant
);
this.minUnconverted=min;
this.maxUnconverted=max;
}
//@Override
public String toString() {
return toString(null);
}
public String toString(final String field) {
final StringBuffer sb=new StringBuffer();
if (!this.field.equals(field)) sb.append(this.field).append(':');
return sb.append(minInclusive ? '[' : '{')
.append((minUnconverted==null) ? "*" : minUnconverted.toString())
.append(" TO ")
.append((maxUnconverted==null) ? "*" : maxUnconverted.toString())
.append(maxInclusive ? ']' : '}').toString();
}
/**
* Two instances are equal if they have the same trie-encoded range bounds, same field, and same variant.
* If one of the instances uses an exclusive lower bound, it is equal to a range with inclusive bound,
* when the inclusive lower bound is equal to the incremented exclusive lower bound of the other one.
* The same applys for the upper bound in other direction.
*/
//@Override
public final boolean equals(final Object o) {
if (o instanceof TrieRangeFilter) {
TrieRangeFilter q=(TrieRangeFilter)o;
// trieVariants are singleton per type, so no equals needed.
return (field==q.field && min.equals(q.min) && max.equals(q.max) && trieVariant==q.trieVariant);
} else return false;
}
//@Override
public final int hashCode() {
// the hash code uses from the variant only the number of bits, as this is unique for the variant
return field.hashCode()+(min.hashCode()^0x14fa55fb)+(max.hashCode()^0x733fa5fe)+(trieVariant.TRIE_BITS^0x64365465);
}
/** prints the String in hexadecimal \\u notation (for debugging of <code>setBits()</code>) */
private String stringToHexDigits(final String s) {
StringBuffer sb=new StringBuffer(s.length()*3);
for (int i=0,c=s.length(); i<c; i++) {
char ch=s.charAt(i);
sb.append("\\u").append(Integer.toHexString((int)ch));
}
return sb.toString();
}
/** Marks documents in a specific range. Code borrowed from original RangeFilter and simplified (and returns number of terms) */
private int setBits(final IndexReader reader, final TermDocs termDocs, final OpenBitSet bits, String lowerTerm, String upperTerm) throws IOException {
//System.out.println(stringToHexDigits(lowerTerm)+" TO "+stringToHexDigits(upperTerm));
int count=0,len=lowerTerm.length();
final String field;
if (len<trieVariant.TRIE_CODED_LENGTH) {
// lower precision value is in helper field
field=(this.field + trieVariant.LOWER_PRECISION_FIELD_NAME_SUFFIX).intern();
// add padding before lower precision values to group them
lowerTerm=new StringBuffer(len+1).append((char)(trieVariant.TRIE_CODED_PADDING_START+len)).append(lowerTerm).toString();
upperTerm=new StringBuffer(len+1).append((char)(trieVariant.TRIE_CODED_PADDING_START+len)).append(upperTerm).toString();
// length is longer by 1 char because of padding
len++;
} else {
// full precision value is in original field
field=this.field;
}
final TermEnum enumerator = reader.terms(new Term(field, lowerTerm));
try {
do {
final Term term = enumerator.term();
if (term!=null && term.field()==field) {
// break out when upperTerm reached or length of term is different
final String t=term.text();
if (len!=t.length() || t.compareTo(upperTerm)>0) break;
// we have a good term, find the docs
count++;
termDocs.seek(enumerator);
while (termDocs.next()) bits.set(termDocs.doc());
} else break;
} while (enumerator.next());
} finally {
enumerator.close();
}
return count;
}
/** Splits range recursively (and returns number of terms) */
private int splitRange(
final IndexReader reader, final TermDocs termDocs, final OpenBitSet bits,
final String min, final boolean lowerBoundOpen, final String max, final boolean upperBoundOpen
) throws IOException {
int count=0;
final int length=min.length();
final String minShort=lowerBoundOpen ? min.substring(0,length-1) : trieVariant.incrementTrieCoded(min.substring(0,length-1));
final String maxShort=upperBoundOpen ? max.substring(0,length-1) : trieVariant.decrementTrieCoded(max.substring(0,length-1));
if (length==1 || minShort.compareTo(maxShort)>=0) {
// we are in the lowest precision or the current precision is not existent
count+=setBits(reader, termDocs, bits, min, max);
} else {
// Avoid too much seeking: first go deeper into lower precision
// (in IndexReader's TermEnum these terms are earlier).
// Do this only, if the current length is not trieVariant.TRIE_CODED_LENGTH (not full precision),
// because terms from the highest prec come before all lower prec terms
// (because the field name is ordered before the suffixed one).
if (length!=trieVariant.TRIE_CODED_LENGTH) count+=splitRange(
reader,termDocs,bits,
minShort,lowerBoundOpen,
maxShort,upperBoundOpen
);
// Avoid too much seeking: set bits for lower part of current (higher) precision.
// These terms come later in IndexReader's TermEnum.
if (!lowerBoundOpen) {
count+=setBits(reader, termDocs, bits, min, trieVariant.decrementTrieCoded(minShort+trieVariant.TRIE_CODED_SYMBOL_MIN));
}
// Avoid too much seeking: set bits for upper part of current precision.
// These terms come later in IndexReader's TermEnum.
if (!upperBoundOpen) {
count+=setBits(reader, termDocs, bits, trieVariant.incrementTrieCoded(maxShort+trieVariant.TRIE_CODED_SYMBOL_MAX), max);
}
// If the first step (see above) was not done (because length==trieVariant.TRIE_CODED_LENGTH) we do it now.
if (length==trieVariant.TRIE_CODED_LENGTH) count+=splitRange(
reader,termDocs,bits,
minShort,lowerBoundOpen,
maxShort,upperBoundOpen
);
}
return count;
}
/**
* Returns a DocIdSet that provides the documents which should be permitted or prohibited in search results.
*/
//@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
if (min.compareTo(max) > 0) {
// shortcut: if min>max, no docs will match!
lastNumberOfTerms=0;
return DocIdSet.EMPTY_DOCIDSET;
} else {
final OpenBitSet bits = new OpenBitSet(reader.maxDoc());
final TermDocs termDocs = reader.termDocs();
try {
lastNumberOfTerms=splitRange(
reader,termDocs,bits,
min,trieVariant.TRIE_CODED_NUMERIC_MIN.equals(min),
max,trieVariant.TRIE_CODED_NUMERIC_MAX.equals(max)
);
} finally {
termDocs.close();
}
return bits;
}
}
/**
* EXPERT: Return the number of terms visited during the last execution of {@link #getDocIdSet}.
* This may be used for performance comparisons of different trie variants and their effectiveness.
* This method is not thread safe, be sure to only call it when no query is running!
* @throws IllegalStateException if {@link #getDocIdSet} was not yet executed.
*/
public int getLastNumberOfTerms() {
if (lastNumberOfTerms < 0) throw new IllegalStateException();
return lastNumberOfTerms;
}
// members
private final String field,min,max;
private final TrieUtils trieVariant;
private final boolean minInclusive,maxInclusive;
private Object minUnconverted,maxUnconverted;
private int lastNumberOfTerms=-1;
}

View File

@ -1,117 +0,0 @@
package org.apache.lucene.search.trie;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Date;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.util.ToStringUtils;
/**
* A Lucene {@link Query} that implements a trie-based range query.
* This query depends on a specific structure of terms in the index that can only be created
* by {@link TrieUtils} methods.
* <p>This class wraps a {@link TrieRangeFilter}.
* @see TrieRangeFilter
*/
public final class TrieRangeQuery extends ConstantScoreQuery {
/**
* Universal constructor (expert use only): Uses already trie-converted min/max values.
* You can set <code>min</code> or <code>max</code> (but not both) to <code>null</code> to leave one bound open.
* With <code>minInclusive</code> and <code>maxInclusive</code> can be choosen, if the corresponding
* bound should be included or excluded from the range.
*/
public TrieRangeQuery(final String field, final String min, final String max,
final boolean minInclusive, final boolean maxInclusive, final TrieUtils variant
) {
super(new TrieRangeFilter(field,min,max,minInclusive,maxInclusive,variant));
}
/**
* A trie query using the supplied field with range bounds in numeric form (double).
* You can set <code>min</code> or <code>max</code> (but not both) to <code>null</code> to leave one bound open.
* With <code>minInclusive</code> and <code>maxInclusive</code> can be choosen, if the corresponding
* bound should be included or excluded from the range.
*/
public TrieRangeQuery(final String field, final Double min, final Double max,
final boolean minInclusive, final boolean maxInclusive, final TrieUtils variant
) {
super(new TrieRangeFilter(field,min,max,minInclusive,maxInclusive,variant));
}
/**
* A trie query using the supplied field with range bounds in date/time form.
* You can set <code>min</code> or <code>max</code> (but not both) to <code>null</code> to leave one bound open.
* With <code>minInclusive</code> and <code>maxInclusive</code> can be choosen, if the corresponding
* bound should be included or excluded from the range.
*/
public TrieRangeQuery(final String field, final Date min, final Date max,
final boolean minInclusive, final boolean maxInclusive, final TrieUtils variant
) {
super(new TrieRangeFilter(field,min,max,minInclusive,maxInclusive,variant));
}
/**
* A trie query using the supplied field with range bounds in integer form (long).
* You can set <code>min</code> or <code>max</code> (but not both) to <code>null</code> to leave one bound open.
* With <code>minInclusive</code> and <code>maxInclusive</code> can be choosen, if the corresponding
* bound should be included or excluded from the range.
*/
public TrieRangeQuery(final String field, final Long min, final Long max,
final boolean minInclusive, final boolean maxInclusive, final TrieUtils variant
) {
super(new TrieRangeFilter(field,min,max,minInclusive,maxInclusive,variant));
}
/**
* EXPERT: Return the number of terms visited during the last execution of the query.
* This may be used for performance comparisons of different trie variants and their effectiveness.
* When using this method be sure to query an one-segment index (optimized one) to get correct results.
* This method is not thread safe, be sure to only call it when no query is running!
* @throws IllegalStateException if query was not yet executed.
*/
public int getLastNumberOfTerms() {
return ((TrieRangeFilter) filter).getLastNumberOfTerms();
}
//@Override
public String toString(final String field) {
// return a more convenient representation of this query than ConstantScoreQuery does:
return ((TrieRangeFilter) filter).toString(field)+ToStringUtils.boost(getBoost());
}
/**
* Two instances are equal if they have the same trie-encoded range bounds, same field, same boost, and same variant.
* If one of the instances uses an exclusive lower bound, it is equal to a range with inclusive bound,
* when the inclusive lower bound is equal to the decremented exclusive lower bound.
* The same applys for the upper bound in other direction.
*/
//@Override
public final boolean equals(final Object o) {
if (!(o instanceof TrieRangeQuery)) return false;
return super.equals(o);
}
//@Override
public final int hashCode() {
// make hashCode a little bit different:
return super.hashCode()^0x1756fa55;
}
}

View File

@ -17,385 +17,531 @@ package org.apache.lucene.search.trie;
* limitations under the License.
*/
import java.util.Date;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.ExtendedFieldCache;
/**
* This is a helper class to construct the trie-based index entries for numerical values.
* <p>For more information on how the algorithm works, see the package description {@link org.apache.lucene.search.trie}.
* The format of how the numerical values are stored in index is documented here:
* <p>All numerical values are first converted to special <code>unsigned long</code>s by applying some bit-wise transformations. This means:<ul>
* <li>{@link Date}s are casted to UNIX timestamps (milliseconds since 1970-01-01, this is how Java represents date/time
* internally): {@link Date#getTime()}. The resulting <code>signed long</code> is transformed to the unsigned form like so:</li>
* <li><code>signed long</code>s are shifted, so that {@link Long#MIN_VALUE} is mapped to <code>0x0000000000000000</code>,
* {@link Long#MAX_VALUE} is mapped to <code>0xffffffffffffffff</code>.</li>
* <li><code>double</code>s are converted by getting their IEEE 754 floating-point "double format" bit layout and then some bits
* are swapped, to be able to compare the result as <code>unsigned long</code>s.</li>
* </ul>
* <p>For each variant (you can choose between {@link #VARIANT_8BIT}, {@link #VARIANT_4BIT}, and {@link #VARIANT_2BIT}),
* the bitmap of this <code>unsigned long</code> is divided into parts of a number of bits (starting with the most-significant bits)
* and each part converted to characters between {@link #TRIE_CODED_SYMBOL_MIN} and {@link #TRIE_CODED_SYMBOL_MAX}.
* The resulting {@link String} is comparable like the corresponding <code>unsigned long</code>.
* <p>To store the different precisions of the long values (from one character [only the most significant one] to the full encoded length),
* each lower precision is prefixed by the length ({@link #TRIE_CODED_PADDING_START}<code>+precision == 0x20+precision</code>),
* in an extra "helper" field with a suffixed field name (i.e. fieldname "numeric" =&gt; lower precision's name "numeric#trie").
* The full long is not prefixed at all and indexed and stored according to the given flags in the original field name.
* By this it is possible to get the correct enumeration of terms in correct precision
* of the term list by just jumping to the correct fieldname and/or prefix. The full precision value may also be
* stored in the document. Having the full precision value as term in a separate field with the original name,
* sorting of query results against such fields is possible using the original field name.
* For more information on how the algorithm works, see the
* {@linkplain org.apache.lucene.search.trie package description}.
* <h3>The trie format using prefix encoded numerical values</h3>
* <p>To quickly execute range queries in Apache Lucene, a range is divided recursively
* into multiple intervals for searching: The center of the range is searched only with
* the lowest possible precision in the trie, while the boundaries are matched
* more exactly. This reduces the number of terms dramatically.
* <p>This class generates terms to achive this: First the numerical integer values need to
* be converted to strings. For that integer values (32 bit or 64 bit) are made unsigned
* and the bits are converted to ASCII chars with each 7 bit. The resulting string is
* sortable like the original integer value.
* <p>To also index floating point numbers, this class supplies two methods to convert them
* to integer values by changing their bit layout: {@link #doubleToSortableLong},
* {@link #floatToSortableInt}. You will have no precision loss by
* converting floating point numbers to integers and back (only that the integer form does
* is not usable). Other data types like dates can easily converted to longs or ints (e.g.
* date to long: {@link java.util.Date#getTime}).
* <p>To index the different precisions of the long values each encoded value is also reduced
* by zeroing bits from the right. Each value is also prefixed (in the first char) by the
* <code>shift</code> value (number of bits removed) used during encoding. This series of
* different precision values can be indexed into a Lucene {@link Document} using
* {@link #addIndexedFields(Document,String,String[])}. The default is to index the original
* precision in the supplied field name and the lower precisions in an additional helper field.
* Because of this, the full-precision field can also be sorted (using {@link #getLongSortField}
* or {@link #getIntSortField}).
* <p>The number of bits removed from the right for each trie entry is called
* <code>precisionStep</code> in this API. For comparing the different step values, see the
* {@linkplain org.apache.lucene.search.trie package description}.
*/
public final class TrieUtils {
/** Instance of TrieUtils using a trie factor of 8 bit.
* This is the <b>recommended</b> one (rather fast and storage optimized) */
public static final TrieUtils VARIANT_8BIT=new TrieUtils(8);
private TrieUtils() {} // no instance!
/** Instance of TrieUtils using a trie factor of 4 bit. */
public static final TrieUtils VARIANT_4BIT=new TrieUtils(4);
/** Instance of TrieUtils using a trie factor of 2 bit.
* This may be good for some indexes, but it needs much storage space
* and is not much faster than 8 bit in most cases. */
public static final TrieUtils VARIANT_2BIT=new TrieUtils(2);
/** Marker (PADDING) before lower-precision trie entries to signal the precision value. See class description! */
public static final char TRIE_CODED_PADDING_START=(char)0x20;
/** The "helper" field containing the lower precision terms is the original fieldname with this appended. */
/**
* The default &quot;helper&quot; field containing the lower precision terms is the original
* fieldname with this appended. This suffix is used in
* {@link #addIndexedFields(Document,String,String[])} and the corresponding c'tor
* of <code>(Long|Int)TrieRangeFilter</code>.
*/
public static final String LOWER_PRECISION_FIELD_NAME_SUFFIX="#trie";
/** Character used as lower end */
public static final char TRIE_CODED_SYMBOL_MIN=(char)0x100;
/**
* Longs are stored at lower precision by shifting off lower bits. The shift count is
* stored as <code>SHIFT_START_LONG+shift</code> in the first character
*/
public static final char SHIFT_START_LONG = (char)0x20;
/**
* A parser instance for filling a {@link ExtendedFieldCache}, that parses trie encoded fields as longs,
* auto detecting the trie encoding variant using the String length.
* Integers are stored at lower precision by shifting off lower bits. The shift count is
* stored as <code>SHIFT_START_INT+shift</code> in the first character
*/
public static final ExtendedFieldCache.LongParser FIELD_CACHE_LONG_PARSER_AUTO=new ExtendedFieldCache.LongParser(){
public final long parseLong(String val) {
return trieCodedToLongAuto(val);
public static final char SHIFT_START_INT = (char)0x60;
/**
* A parser instance for filling a {@link ExtendedFieldCache}, that parses prefix encoded fields as longs.
*/
public static final ExtendedFieldCache.LongParser FIELD_CACHE_LONG_PARSER=new ExtendedFieldCache.LongParser(){
public final long parseLong(final String val) {
return prefixCodedToLong(val);
}
};
/**
* A parser instance for filling a {@link ExtendedFieldCache}, that parses trie encoded fields as doubles,
* auto detecting the trie encoding variant using the String length.
* A parser instance for filling a {@link FieldCache}, that parses prefix encoded fields as ints.
*/
public static final ExtendedFieldCache.DoubleParser FIELD_CACHE_DOUBLE_PARSER_AUTO=new ExtendedFieldCache.DoubleParser(){
public final double parseDouble(String val) {
return trieCodedToDoubleAuto(val);
public static final FieldCache.IntParser FIELD_CACHE_INT_PARSER=new FieldCache.IntParser(){
public final int parseInt(final String val) {
return prefixCodedToInt(val);
}
};
/**
* A parser instance for filling a {@link ExtendedFieldCache}, that parses prefix encoded fields as doubles.
* This uses {@link #sortableLongToDouble} to convert the encoded long to a double.
*/
public static final ExtendedFieldCache.DoubleParser FIELD_CACHE_DOUBLE_PARSER=new ExtendedFieldCache.DoubleParser(){
public final double parseDouble(final String val) {
return sortableLongToDouble(prefixCodedToLong(val));
}
};
/**
* Detects and returns the variant of a trie encoded string using the length.
* @throws NumberFormatException if the length is not 8, 16, or 32 chars.
* A parser instance for filling a {@link FieldCache}, that parses prefix encoded fields as floats.
* This uses {@link #sortableIntToFloat} to convert the encoded int to a float.
*/
public static final TrieUtils autoDetectVariant(final String s) {
final int l=s.length();
if (l==VARIANT_8BIT.TRIE_CODED_LENGTH) {
return VARIANT_8BIT;
} else if (l==VARIANT_4BIT.TRIE_CODED_LENGTH) {
return VARIANT_4BIT;
} else if (l==VARIANT_2BIT.TRIE_CODED_LENGTH) {
return VARIANT_2BIT;
} else {
throw new NumberFormatException("Invalid trie encoded numerical value representation (incompatible length).");
public static final FieldCache.FloatParser FIELD_CACHE_FLOAT_PARSER=new FieldCache.FloatParser(){
public final float parseFloat(final String val) {
return sortableIntToFloat(prefixCodedToInt(val));
}
};
/**
* This is a convenience method, that returns prefix coded bits of a long without
* reducing the precision. It can be used to store the full precision value as a
* stored field in index.
* <p>To decode, use {@link #prefixCodedToLong}.
*/
public static String longToPrefixCoded(final long val) {
return longToPrefixCoded(val, 0);
}
/**
* Converts a encoded <code>String</code> value back to a <code>long</code>,
* auto detecting the trie encoding variant using the String length.
* Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
* This is method is used by {@link #trieCodeLong}.
*/
public static final long trieCodedToLongAuto(final String s) {
return autoDetectVariant(s).trieCodedToLong(s);
}
/**
* Converts a encoded <code>String</code> value back to a <code>double</code>,
* auto detecting the trie encoding variant using the String length.
*/
public static final double trieCodedToDoubleAuto(final String s) {
return autoDetectVariant(s).trieCodedToDouble(s);
}
/**
* Converts a encoded <code>String</code> value back to a <code>Date</code>,
* auto detecting the trie encoding variant using the String length.
*/
public static final Date trieCodedToDateAuto(final String s) {
return autoDetectVariant(s).trieCodedToDate(s);
}
/**
* A factory method, that generates a {@link SortField} instance for sorting trie encoded values,
* automatically detecting the trie encoding variant using the String length.
*/
public static final SortField getSortFieldAuto(final String field) {
return new SortField(field, FIELD_CACHE_LONG_PARSER_AUTO);
}
/**
* A factory method, that generates a {@link SortField} instance for sorting trie encoded values,
* automatically detecting the trie encoding variant using the String length.
*/
public static final SortField getSortFieldAuto(final String field, boolean reverse) {
return new SortField(field, FIELD_CACHE_LONG_PARSER_AUTO, reverse);
}
// TrieUtils instance's part
private TrieUtils(int bits) {
assert 64%bits == 0;
// helper variable for conversion
mask = (1L << bits) - 1L;
// init global "constants"
TRIE_BITS=bits;
TRIE_CODED_LENGTH=64/TRIE_BITS;
TRIE_CODED_SYMBOL_MAX=(char)(TRIE_CODED_SYMBOL_MIN+mask);
TRIE_CODED_NUMERIC_MIN=longToTrieCoded(Long.MIN_VALUE);
TRIE_CODED_NUMERIC_MAX=longToTrieCoded(Long.MAX_VALUE);
}
// internal conversion to/from strings
private final String internalLongToTrieCoded(long l) {
final char[] buf=new char[TRIE_CODED_LENGTH];
for (int i=TRIE_CODED_LENGTH-1; i>=0; i--) {
buf[i] = (char)( TRIE_CODED_SYMBOL_MIN + (l & mask) );
l = l >>> TRIE_BITS;
public static String longToPrefixCoded(final long val, final int shift) {
if (shift>63 || shift<0)
throw new IllegalArgumentException("Illegal shift value, must be 0..63");
int nChars = (63-shift)/7 + 1;
final char[] arr = new char[nChars+1];
arr[0] = (char)(SHIFT_START_LONG + shift);
long sortableBits = val ^ 0x8000000000000000L;
sortableBits >>>= shift;
while (nChars>=1) {
// Store 7 bits per character for good efficiency when UTF-8 encoding.
// The whole number is right-justified so that lucene can prefix-encode
// the terms more efficiently.
arr[nChars--] = (char)(sortableBits & 0x7f);
sortableBits >>>= 7;
}
return new String(buf);
return new String(arr);
}
private final long internalTrieCodedToLong(final String s) {
if (s==null) throw new NullPointerException("Trie encoded string may not be NULL");
final int len=s.length();
if (len!=TRIE_CODED_LENGTH) throw new NumberFormatException(
"Invalid trie encoded numerical value representation (incompatible length, must be "+TRIE_CODED_LENGTH+")"
);
long l=0L;
for (int i=0; i<len; i++) {
char ch=s.charAt(i);
if (ch>=TRIE_CODED_SYMBOL_MIN && ch<=TRIE_CODED_SYMBOL_MAX) {
l = (l << TRIE_BITS) | (long)(ch-TRIE_CODED_SYMBOL_MIN);
} else {
/**
* This is a convenience method, that returns prefix coded bits of an int without
* reducing the precision. It can be used to store the full precision value as a
* stored field in index.
* <p>To decode, use {@link #prefixCodedToInt}.
*/
public static String intToPrefixCoded(final int val) {
return intToPrefixCoded(val, 0);
}
/**
* Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
* This is method is used by {@link #trieCodeInt}.
*/
public static String intToPrefixCoded(final int val, final int shift) {
if (shift>31 || shift<0)
throw new IllegalArgumentException("Illegal shift value, must be 0..31");
int nChars = (31-shift)/7 + 1;
final char[] arr = new char[nChars+1];
arr[0] = (char)(SHIFT_START_INT + shift);
int sortableBits = val ^ 0x80000000;
sortableBits >>>= shift;
while (nChars>=1) {
// Store 7 bits per character for good efficiency when UTF-8 encoding.
// The whole number is right-justified so that lucene can prefix-encode
// the terms more efficiently.
arr[nChars--] = (char)(sortableBits & 0x7f);
sortableBits >>>= 7;
}
return new String(arr);
}
/**
* Returns a long from prefixCoded characters.
* Rightmost bits will be zero for lower precision codes.
* This method can be used to decode e.g. a stored field.
* @see #longToPrefixCoded(long)
*/
public static long prefixCodedToLong(final String prefixCoded) {
final int len = prefixCoded.length();
final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG;
if (shift>63 || shift<0)
throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)");
long sortableBits = 0L;
for (int i=1; i<len; i++) {
sortableBits <<= 7;
final char ch = prefixCoded.charAt(i);
if (ch>0x7f) {
throw new NumberFormatException(
"Invalid trie encoded numerical value representation (char "+
"Invalid prefixCoded numerical value representation (char "+
Integer.toHexString((int)ch)+" at position "+i+" is invalid)"
);
}
sortableBits |= (long)(ch & 0x7f);
}
return l;
return (sortableBits << shift) ^ 0x8000000000000000L;
}
// Long's
/** Converts a <code>long</code> value encoded to a <code>String</code>. */
public String longToTrieCoded(final long l) {
return internalLongToTrieCoded(l ^ 0x8000000000000000L);
}
/** Converts a encoded <code>String</code> value back to a <code>long</code>. */
public long trieCodedToLong(final String s) {
return internalTrieCodedToLong(s) ^ 0x8000000000000000L;
}
// Double's
/** Converts a <code>double</code> value encoded to a <code>String</code>. */
public String doubleToTrieCoded(final double d) {
long l=Double.doubleToLongBits(d);
if ((l & 0x8000000000000000L) == 0L) {
// >0
l |= 0x8000000000000000L;
} else {
// <0
l = ~l;
}
return internalLongToTrieCoded(l);
}
/** Converts a encoded <code>String</code> value back to a <code>double</code>. */
public double trieCodedToDouble(final String s) {
long l=internalTrieCodedToLong(s);
if ((l & 0x8000000000000000L) != 0L) {
// >0
l &= 0x7fffffffffffffffL;
} else {
// <0
l = ~l;
}
return Double.longBitsToDouble(l);
}
// Date's
/** Converts a <code>Date</code> value encoded to a <code>String</code>. */
public String dateToTrieCoded(final Date d) {
return longToTrieCoded(d.getTime());
}
/** Converts a encoded <code>String</code> value back to a <code>Date</code>. */
public Date trieCodedToDate(final String s) {
return new Date(trieCodedToLong(s));
}
// increment / decrement
/** Increments an encoded String value by 1. Needed by {@link TrieRangeFilter}. */
public String incrementTrieCoded(final String v) {
final int l=v.length();
final char[] buf=new char[l];
boolean inc=true;
for (int i=l-1; i>=0; i--) {
int b=v.charAt(i)-TRIE_CODED_SYMBOL_MIN;
if (inc) b++;
if (inc=(b>(int)mask)) b=0;
buf[i]=(char)(TRIE_CODED_SYMBOL_MIN+b);
}
return new String(buf);
}
/** Decrements an encoded String value by 1. Needed by {@link TrieRangeFilter}. */
public String decrementTrieCoded(final String v) {
final int l=v.length();
final char[] buf=new char[l];
boolean dec=true;
for (int i=l-1; i>=0; i--) {
int b=v.charAt(i)-TRIE_CODED_SYMBOL_MIN;
if (dec) b--;
if (dec=(b<0)) b=(int)mask;
buf[i]=(char)(TRIE_CODED_SYMBOL_MIN+b);
}
return new String(buf);
}
private void addConvertedTrieCodedDocumentField(
final Document ldoc, final String fieldname, final String val,
final boolean index, final Field.Store store
) {
Field f=new Field(fieldname, val, store, index?Field.Index.NOT_ANALYZED_NO_NORMS:Field.Index.NO);
if (index) {
f.setOmitTf(true);
ldoc.add(f);
// add the lower precision values in the helper field with prefix
final StringBuffer sb=new StringBuffer(TRIE_CODED_LENGTH);
synchronized(sb) {
for (int i=TRIE_CODED_LENGTH-1; i>0; i--) {
sb.setLength(0);
f=new Field(
fieldname + LOWER_PRECISION_FIELD_NAME_SUFFIX,
sb.append( (char)(TRIE_CODED_PADDING_START+i) ).append( val.substring(0,i) ).toString(),
Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS
);
f.setOmitTf(true);
ldoc.add(f);
}
/**
* Returns an int from prefixCoded characters.
* Rightmost bits will be zero for lower precision codes.
* This method can be used to decode e.g. a stored field.
* @see #intToPrefixCoded(int)
*/
public static int prefixCodedToInt(final String prefixCoded) {
final int len = prefixCoded.length();
final int shift = prefixCoded.charAt(0)-SHIFT_START_INT;
if (shift>31 || shift<0)
throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really an INT?)");
int sortableBits = 0;
for (int i=1; i<len; i++) {
sortableBits <<= 7;
final char ch = prefixCoded.charAt(i);
if (ch>0x7f) {
throw new NumberFormatException(
"Invalid prefixCoded numerical value representation (char "+
Integer.toHexString((int)ch)+" at position "+i+" is invalid)"
);
}
} else {
ldoc.add(f);
sortableBits |= (int)(ch & 0x7f);
}
return (sortableBits << shift) ^ 0x80000000;
}
/**
* Stores a double value in trie-form in document for indexing.
* <p>To store the different precisions of the long values (from one byte [only the most significant one] to the full eight bytes),
* each lower precision is prefixed by the length ({@link #TRIE_CODED_PADDING_START}<code>+precision</code>),
* in an extra "helper" field with a name of <code>fieldname+{@link #LOWER_PRECISION_FIELD_NAME_SUFFIX}</code>
* (i.e. fieldname "numeric" => lower precision's name "numeric#trie").
* The full long is not prefixed at all and indexed and stored according to the given flags in the original field name.
* If the field should not be searchable, set <code>index</code> to <code>false</code>. It is then only stored (for convenience).
* Fields added to a document using this method can be queried by {@link TrieRangeQuery}.
* Converts a <code>double</code> value to a sortable signed <code>long</code>.
* The value is converted by getting their IEEE 754 floating-point &quot;double format&quot;
* bit layout and then some bits are swapped, to be able to compare the result as long.
* By this the precision is not reduced, but the value can easily used as a long.
* @see #sortableLongToDouble
*/
public void addDoubleTrieCodedDocumentField(
final Document ldoc, final String fieldname, final double val,
final boolean index, final Field.Store store
) {
addConvertedTrieCodedDocumentField(ldoc, fieldname, doubleToTrieCoded(val), index, store);
public static long doubleToSortableLong(double val) {
long f = Double.doubleToLongBits(val);
if (f<0) f ^= 0x7fffffffffffffffL;
return f;
}
/**
* Stores a Date value in trie-form in document for indexing.
* <p>To store the different precisions of the long values (from one byte [only the most significant one] to the full eight bytes),
* each lower precision is prefixed by the length ({@link #TRIE_CODED_PADDING_START}<code>+precision</code>),
* in an extra "helper" field with a name of <code>fieldname+{@link #LOWER_PRECISION_FIELD_NAME_SUFFIX}</code>
* (i.e. fieldname "numeric" => lower precision's name "numeric#trie").
* The full long is not prefixed at all and indexed and stored according to the given flags in the original field name.
* If the field should not be searchable, set <code>index</code> to <code>false</code>. It is then only stored (for convenience).
* Fields added to a document using this method can be queried by {@link TrieRangeQuery}.
* Converts a sortable <code>long</code> back to a <code>double</code>.
* @see #doubleToSortableLong
*/
public void addDateTrieCodedDocumentField(
final Document ldoc, final String fieldname,
final Date val, final boolean index, final Field.Store store
) {
addConvertedTrieCodedDocumentField(ldoc, fieldname, dateToTrieCoded(val), index, store);
public static double sortableLongToDouble(long val) {
if (val<0) val ^= 0x7fffffffffffffffL;
return Double.longBitsToDouble(val);
}
/**
* Stores a long value in trie-form in document for indexing.
* <p>To store the different precisions of the long values (from one byte [only the most significant one] to the full eight bytes),
* each lower precision is prefixed by the length ({@link #TRIE_CODED_PADDING_START}<code>+precision</code>),
* in an extra "helper" field with a name of <code>fieldname+{@link #LOWER_PRECISION_FIELD_NAME_SUFFIX}</code>
* (i.e. fieldname "numeric" => lower precision's name "numeric#trie").
* The full long is not prefixed at all and indexed and stored according to the given flags in the original field name.
* If the field should not be searchable, set <code>index</code> to <code>false</code>. It is then only stored (for convenience).
* Fields added to a document using this method can be queried by {@link TrieRangeQuery}.
* Converts a <code>float</code> value to a sortable signed <code>int</code>.
* The value is converted by getting their IEEE 754 floating-point &quot;float format&quot;
* bit layout and then some bits are swapped, to be able to compare the result as int.
* By this the precision is not reduced, but the value can easily used as an int.
* @see #sortableIntToFloat
*/
public void addLongTrieCodedDocumentField(
final Document ldoc, final String fieldname,
final long val, final boolean index, final Field.Store store
) {
addConvertedTrieCodedDocumentField(ldoc, fieldname, longToTrieCoded(val), index, store);
public static int floatToSortableInt(float val) {
int f = Float.floatToIntBits(val);
if (f<0) f ^= 0x7fffffff;
return f;
}
/** A factory method, that generates a {@link SortField} instance for sorting trie encoded values. */
public SortField getSortField(final String field) {
return new SortField(field, FIELD_CACHE_LONG_PARSER);
/**
* Converts a sortable <code>int</code> back to a <code>float</code>.
* @see #floatToSortableInt
*/
public static float sortableIntToFloat(int val) {
if (val<0) val ^= 0x7fffffff;
return Float.intBitsToFloat(val);
}
/** A factory method, that generates a {@link SortField} instance for sorting trie encoded values. */
public SortField getSortField(final String field, boolean reverse) {
/** A factory method, that generates a {@link SortField} instance for sorting prefix encoded long values. */
public static SortField getLongSortField(final String field, boolean reverse) {
return new SortField(field, FIELD_CACHE_LONG_PARSER, reverse);
}
/** A parser instance for filling a {@link ExtendedFieldCache}, that parses trie encoded fields as longs. */
public final ExtendedFieldCache.LongParser FIELD_CACHE_LONG_PARSER=new ExtendedFieldCache.LongParser(){
public final long parseLong(String val) {
return trieCodedToLong(val);
/** A factory method, that generates a {@link SortField} instance for sorting prefix encoded int values. */
public static SortField getIntSortField(final String field, boolean reverse) {
return new SortField(field, FIELD_CACHE_INT_PARSER, reverse);
}
/**
* Returns a sequence of trie coded numbers suitable for {@link LongTrieRangeFilter}.
* Each successive string in the list has had it's precision reduced by <code>precisionStep</code>.
* For sorting, index the first full-precision value into a separate field and the
* remaining values into another field.
* <p>To achieve this, use {@link #addIndexedFields(Document,String,String[])}.
*/
public static String[] trieCodeLong(long val, int precisionStep) {
if (precisionStep<1 || precisionStep>64)
throw new IllegalArgumentException("precisionStep may only be 1..64");
String[] arr = new String[63/precisionStep+1];
int idx = 0;
for (int shift=0; shift<64; shift+=precisionStep) {
arr[idx++] = longToPrefixCoded(val, shift);
}
};
/** A parser instance for filling a {@link ExtendedFieldCache}, that parses trie encoded fields as doubles. */
public final ExtendedFieldCache.DoubleParser FIELD_CACHE_DOUBLE_PARSER=new ExtendedFieldCache.DoubleParser(){
public final double parseDouble(String val) {
return trieCodedToDouble(val);
return arr;
}
/**
* Returns a sequence of trie coded numbers suitable for {@link IntTrieRangeFilter}.
* Each successive string in the list has had it's precision reduced by <code>precisionStep</code>.
* For sorting, index the first full-precision value into a separate field and the
* remaining values into another field.
* <p>To achieve this, use {@link #addIndexedFields(Document,String,String[])}.
*/
public static String[] trieCodeInt(int val, int precisionStep) {
if (precisionStep<1 || precisionStep>32)
throw new IllegalArgumentException("precisionStep may only be 1..32");
String[] arr = new String[31/precisionStep+1];
int idx = 0;
for (int shift=0; shift<32; shift+=precisionStep) {
arr[idx++] = intToPrefixCoded(val, shift);
}
};
return arr;
}
/**
* Indexes the full precision value only in the main field (for sorting), and indexes all other
* lower precision values in <code>field+LOWER_PRECISION_FIELD_NAME_SUFFIX</code>.
* <p><b>This is the recommended variant to add trie fields to the index.</b>
* By this it is possible to sort the field using a <code>SortField</code> instance
* returned by {@link #getLongSortField} or {@link #getIntSortField}.
* <p>This method does not store the fields and saves no term frequency or norms
* (which are normally not needed for trie fields). If you want to additionally store
* the value, you can use the normal methods of {@link Document} to achive this, just specify
* <code>Field.Store.YES</code>, <code>Field.Index.NO</code> and the same field name.
* <p>Examples:
* <pre>
* addIndexedFields(doc, "mydouble", trieCodeLong(doubleToSortableLong(1.414d), 4));
* addIndexedFields(doc, "mylong", trieCodeLong(123456L, 4));
* </pre>
**/
public static void addIndexedFields(Document doc, String field, String[] trieCoded) {
addIndexedFields(doc, new String[]{field, field+LOWER_PRECISION_FIELD_NAME_SUFFIX}, trieCoded);
}
/**
* Expert: Indexes the full precision value only in the main field (for sorting), and indexes all other
* lower precision values in the <code>lowerPrecision</code> field.
* If you do not specify the same field name for the main and lower precision one,
* it is possible to sort the field using a <code>SortField</code> instance
* returned by {@link #getLongSortField} or {@link #getIntSortField}.
* <p>This method does not store the fields and saves no term frequency or norms
* (which are normally not needed for trie fields). If you want to additionally store
* the value, you can use the normal methods of {@link Document} to achive this, just specify
* <code>Field.Store.YES</code>, <code>Field.Index.NO</code> and the same main field name.
* <p>Examples:
* <pre>
* addIndexedFields(doc, "mydouble", "mydoubletrie", trieCodeLong(doubleToSortableLong(1.414d), 4));
* addIndexedFields(doc, "mylong", "mylongtrie", trieCodeLong(123456L, 4));
* </pre>
* @see #addIndexedFields(Document,String,String[])
**/
public static void addIndexedFields(Document doc, String field, String lowerPrecisionField, String[] trieCoded) {
addIndexedFields(doc, new String[]{field, lowerPrecisionField}, trieCoded);
}
/**
* Expert: Indexes a series of trie coded values into a lucene {@link Document}
* using the given field names.
* If the array of field names is shorter than the trie coded one, all trie coded
* values with higher index get the last field name.
* <p>This method does not store the fields and saves no term frequency or norms
* (which are normally not needed for trie fields). If you want to additionally store
* the value, you can use the normal methods of {@link Document} to achive this, just specify
* <code>Field.Store.YES</code>, <code>Field.Index.NO</code> and the same main field name.
**/
public static void addIndexedFields(Document doc, String[] fields, String[] trieCoded) {
for (int i=0; i<trieCoded.length; i++) {
final int fnum = Math.min(fields.length-1, i);
final Field f = new Field(fields[fnum], trieCoded[i], Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
f.setOmitTf(true);
doc.add(f);
}
}
/**
* Expert: Splits a long range recursively.
* You may implement a builder that adds clauses to a
* {@link org.apache.lucene.search.BooleanQuery} for each call to its
* {@link IntRangeBuilder#addRange(String,String,int)}
* method.
* <p>This method is used by {@link LongTrieRangeFilter}.
*/
public static void splitLongRange(final LongRangeBuilder builder,
final int precisionStep, final long minBound, final long maxBound
) {
if (precisionStep<1 || precisionStep>64)
throw new IllegalArgumentException("precisionStep may only be 1..64");
splitRange(
builder, 64, precisionStep, minBound, maxBound,
0 /* start with no shift */
);
}
private final long mask;
/**
* Expert: Splits an int range recursively.
* You may implement a builder that adds clauses to a
* {@link org.apache.lucene.search.BooleanQuery} for each call to its
* {@link IntRangeBuilder#addRange(String,String,int)}
* method.
* <p>This method is used by {@link IntTrieRangeFilter}.
*/
public static void splitIntRange(final IntRangeBuilder builder,
final int precisionStep, final int minBound, final int maxBound
) {
if (precisionStep<1 || precisionStep>32)
throw new IllegalArgumentException("precisionStep may only be 1..32");
splitRange(
builder, 32, precisionStep, (long)minBound, (long)maxBound,
0 /* start with no shift */
);
}
/** Number of bits used in this trie variant (2, 4, or 8) */
public final int TRIE_BITS;
/** This helper does the splitting for both 32 and 64 bit. */
private static void splitRange(
final Object builder, final int valSize,
final int precisionStep, final long minBound, final long maxBound,
final int shift
) {
// calculate new bounds for inner precision
final long diff = 1L << (shift+precisionStep),
mask = ((1L<<precisionStep) - 1L) << shift;
final boolean
hasLower = (minBound & mask) != 0L,
hasUpper = (maxBound & mask) != mask;
final long
nextMinBound = (hasLower ? (minBound + diff) : minBound) & ~mask,
nextMaxBound = (hasUpper ? (maxBound - diff) : maxBound) & ~mask;
/** Length (in chars) of an encoded value (8, 16, or 32 chars) */
public final int TRIE_CODED_LENGTH;
/** Character used as upper end (depends on trie bits, its <code>{@link #TRIE_CODED_SYMBOL_MIN}+2^{@link #TRIE_BITS}-1</code>) */
public final char TRIE_CODED_SYMBOL_MAX;
/** minimum encoded value of a numerical index entry: {@link Long#MIN_VALUE} */
public final String TRIE_CODED_NUMERIC_MIN;
/** maximum encoded value of a numerical index entry: {@link Long#MAX_VALUE} */
public final String TRIE_CODED_NUMERIC_MAX;
if (shift+precisionStep>=valSize || nextMinBound>nextMaxBound) {
// We are in the lowest precision or the next precision is not available.
addRange(builder, valSize, precisionStep, minBound, maxBound, shift);
} else {
if (hasLower)
addRange(builder, valSize, precisionStep, minBound, minBound | mask, shift);
if (hasUpper)
addRange(builder, valSize, precisionStep, maxBound & ~mask, maxBound, shift);
// recurse down to next precision
splitRange(
builder, valSize, precisionStep,
nextMinBound, nextMaxBound,
shift+precisionStep
);
}
}
/** Helper that delegates to correct range builder */
private static void addRange(
final Object builder, final int valSize,
final int precisionStep, long minBound, long maxBound,
final int shift
) {
// for the max bound set all lower bits (that were shifted away):
// this is important for testing or other usages of the splitted range
// (e.g. to reconstruct the full range). The prefixEncoding will remove
// the bits anyway, so they do not hurt!
maxBound |= (1L << shift) - 1L;
// delegate to correct range builder
switch(valSize) {
case 64:
((LongRangeBuilder)builder).addRange(precisionStep, minBound, maxBound, shift);
break;
case 32:
((IntRangeBuilder)builder).addRange(precisionStep, (int)minBound, (int)maxBound, shift);
break;
default:
// Should not happen!
throw new IllegalArgumentException("valSize must be 32 or 64.");
}
}
/**
* Expert: Callback for {@link #splitLongRange}.
* You need to overwrite only one of the methods.
*/
public static abstract class LongRangeBuilder {
/**
* Overwrite this method, if you like to receive the already prefix encoded range bounds.
* You can directly build classical range queries from them.
* The level gives the precision level (0 = highest precision) of the encoded values.
* This parameter could be used as an index to an array of fieldnames like the
* parameters to {@link #addIndexedFields(Document,String[],String[])} for specifying
* the field names for each precision:
* <pre>
* String field = fields[Math.min(fields.length-1, level)];
* </pre>
*/
public void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
throw new UnsupportedOperationException();
}
/**
* Overwrite this method, if you like to receive the raw long range bounds.
* You can use this for e.g. debugging purposes (print out range bounds).
*/
public void addRange(final int precisionStep, final long min, final long max, final int shift) {
/*System.out.println(Long.toHexString((min^0x8000000000000000L) >>> shift)+".."+
Long.toHexString((max^0x8000000000000000L) >>> shift));*/
addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift), shift/precisionStep);
}
}
/**
* Expert: Callback for {@link #splitIntRange}.
* You need to overwrite only one of the methods.
*/
public static abstract class IntRangeBuilder {
/**
* Overwrite this method, if you like to receive the already prefix encoded range bounds.
* You can directly build classical range queries from them.
* The level gives the precision level (0 = highest precision) of the encoded values.
* This parameter could be used as an index to an array of fieldnames like the
* parameters to {@link #addIndexedFields(Document,String[],String[])} for specifying
* the field names for each precision:
* <pre>
* String field = fields[Math.min(fields.length-1, level)];
* </pre>
*/
public void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
throw new UnsupportedOperationException();
}
/**
* Overwrite this method, if you like to receive the raw int range bounds.
* You can use this for e.g. debugging purposes (print out range bounds).
*/
public void addRange(final int precisionStep, final int min, final int max, final int shift) {
/*System.out.println(Integer.toHexString((min^0x80000000) >>> shift)+".."+
Integer.toHexString((max^0x80000000) >>> shift));*/
addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift), shift/precisionStep);
}
}
}

View File

@ -1,7 +1,7 @@
<html>
<body>
<p>This package provides fast numeric range queries/filters on <code>long</code>, <code>double</code> or <code>Date</code>
fields based on trie structures.</p>
<p>This package provides fast numeric range queries/filters on <code>long</code>, <code>double</code>, <code>int</code>,
or <code>float</code> (and other data types, that can be converted to numerical values) fields based on trie structures.</p>
<h3>How it works</h3>
<p>See the publication about <a target="_blank" href="http://www.panfmp.org">panFMP</a>, where this algorithm was described:
@ -14,66 +14,96 @@ Computers &amp; Geosciences 34 (12), 1947-1955.
it cannot handle numerical ranges (e.g., field value is inside user defined bounds, even dates are numerical values).
We have developed an extension to Apache Lucene that stores
the numerical values in a special string-encoded format with variable precision
(all numerical values like doubles, longs, and timestamps are converted to lexicographic sortable string representations
and stored with different precisions from one byte to the full 8 bytes - depending on the variant used).
For a more detailed description of how the values are stored, see {@link org.apache.lucene.search.trie.TrieUtils}.
A range is then divided recursively into multiple intervals for searching:
The center of the range is searched only with the lowest possible precision in the trie, while the boundaries are matched
more exactly. This reduces the number of terms dramatically.</p>
(all numerical values like doubles, longs, floats, and ints are converted to lexicographic sortable string representations
and stored with different precisions. For a more detailed description of how the values are stored,
see {@link org.apache.lucene.search.trie.TrieUtils}. A range is then divided recursively into
multiple intervals for searching:
The center of the range is searched only with the lowest possible precision in the trie,
while the boundaries are matched more exactly. This reduces the number of terms dramatically.</p>
<p>For the variant that uses a lowest precision of 1-byte the index
contains only a maximum of 256 distinct values in the lowest precision.
Overall, a range could consist of a theoretical maximum of
<p>For the variant that stores long values in 8 different precisions (each reduced by 8 bits) that
uses a lowest precision of 1 byte, the index contains only a maximum of 256 distinct values in the
lowest precision. Overall, a range could consist of a theoretical maximum of
<code>7*255*2 + 255 = 3825</code> distinct terms (when there is a term for every distinct value of an
8-byte-number in the index and the range covers all of them; a maximum of 255 distinct values is used
8-byte-number in the index and the range covers almost all of them; a maximum of 255 distinct values is used
because it would always be possible to reduce the full 256 values to one term with degraded precision).
In practise, we have seen up to 300 terms in most cases (index with 500,000 metadata records
and a uniform value distribution).</p>
<p>There are two other variants of encoding: 4bit and 2bit. Each variant stores more different precisions
of the longs and thus needs more storage space (because it generates more and longer terms -
4bit: two times the length and number of terms; 2bit: four times the length and number of terms).
But on the other hand, the maximum number of distinct terms used for range queries is
<code>15*15*2 + 15 = 465</code> for the 4bit variant, and
<code>31*3*2 + 3 = 189</code> for the 2bit variant.</p>
<p>You can choose any <code>precisionStep</code> when encoding integer values.
Lower step values mean more precisions and so more terms in index (and index gets larger).
On the other hand, the maximum number of terms to match reduces, which optimized query speed.
The formula to calculate the maximum term count is:
<pre>
n = [ (bitsPerValue/precisionStep - 1) * (2^precisionStep - 1 ) * 2 ] + (2^precisionStep - 1 )
</pre>
<p><em>(this formula is only correct, when <code>bitsPerValue/precisionStep</code> is an integer;
in other cases, the value must be rounded up and the last summand must contain the modulo of the division as
precision step)</em>.
For longs stored using a precision step of 4, <code>n = 15*15*2 + 15 = 465</code>, and for a precision
step of 2, <code>n = 31*3*2 + 3 = 189</code>. But the faster search speed is reduced by more seeking
in the term enum of the index. Because of this, the ideal <code>precisionStep</code> value can only
be found out by testing. <b>Important:</b> You can index with a lower precision step value and test search speed
using a multiple of the original step value.</p>
<p>This dramatically improves the performance of Apache Lucene with range queries, which
are no longer dependent on the index size and the number of distinct values because there is
an upper limit unrelated to either of these properties.</p>
<h3>Usage</h3>
<p>To use the new query types the numerical values, which may be <code>long</code>, <code>double</code> or <code>Date</code>,
the values must be stored during indexing in a special format in the index (using {@link org.apache.lucene.search.trie.TrieUtils}).
This can be done like this:</p>
<p>To use the new query types the numerical values, which may be<code>long</code>, <code>double</code>, <code>int</code>,
<code>float</code>, or <code>Date</code>, the values must be indexed in a special prefix encoded format
(using {@link org.apache.lucene.search.trie.TrieUtils}). This can be done like this:</p>
<pre>
<em>// chose a step value, 8 is a general good value for large indexes:</em>
int precisionStep = 8;
Document doc = new Document();
// add some standard fields:
<em>// add some standard fields:</em>
String svalue = "anything to index";
doc.add(new Field("exampleString", svalue, Field.Store.YES, Field.Index.ANALYZED) ;
// add some numerical fields:
double fvalue = 1.057E17;
TrieUtils.VARIANT_8BIT.addDoubleTrieCodedDocumentField(doc, "exampleDouble", fvalue, true /* index the field */, Field.Store.YES);
doc.add(new Field("exampleString", svalue, Field.Store.YES, Field.Index.ANALYZED));
<em>// add some numerical fields:</em>
long lvalue = 121345L;
TrieUtils.VARIANT_8BIT.addLongTrieCodedDocumentField(doc, "exampleLong", lvalue, true /* index the field */, Field.Store.YES);
Date dvalue = new Date(); // actual time
TrieUtils.VARIANT_8BIT.addDateTrieCodedDocumentField(doc, "exampleDate", dvalue, true /* index the field */, Field.Store.YES);
// add document to IndexWriter
TrieUtils.addIndexedFields(doc, "exampleLong", TrieUtils.trieCodeLong(lvalue, precisionStep));
double dvalue = 1.057E17;
TrieUtils.addIndexedFields(doc, "exampleDouble", TrieUtils.trieCodeLong(TrieUtils.doubleToSortableLong(dvalue), precisionStep));
int ivalue = 121345;
TrieUtils.addIndexedFields(doc, "exampleInt", TrieUtils.trieCodeInt(ivalue, precisionStep));
float fvalue = 1.057E17f;
TrieUtils.addIndexedFields(doc, "exampleFloat", TrieUtils.trieCodeInt(TrieUtils.floatToSortableInt(fvalue), precisionStep));
Date datevalue = new Date(); <em>// actual time</em>
TrieUtils.addIndexedFields(doc, "exampleDate", TrieUtils.trieCodeLong(datevalue.getTime(), precisionStep));
<em>// if you want to also store one of the values:</em>
doc.add(new Field("exampleLong", Long.toString(lvalue), Field.Store.YES, Field.Index.NO));
<em>// or as encoded value:</em>
doc.add(new Field("exampleLong2", TrieUtils.longToPrefixCoded(lvalue), Field.Store.YES, Field.Index.NO));
<em>// now add document to IndexWriter, as usual</em>
</pre>
<p>The numeric index fields you prepared in this way can be searched by {@link org.apache.lucene.search.trie.TrieRangeQuery}:</p>
<p>The numeric index fields you prepared in this way can be searched by
{@link org.apache.lucene.search.trie.LongTrieRangeFilter} or {@link org.apache.lucene.search.trie.IntTrieRangeFilter}:</p>
<pre>
// Java 1.4, because Double.valueOf(double) is not available:
Query q = new TrieRangeQuery("exampleDouble", new Double(1.0E17), new Double(2.0E17), TrieUtils.VARIANT_8BIT);
// OR, Java 1.5, using autoboxing:
Query q = new TrieRangeQuery("exampleDouble", 1.0E17, 2.0E17, TrieUtils.VARIANT_8BIT);
<em>// Java 1.4, because Long.valueOf(long) is not available:</em>
Query q = new LongTrieRangeFilter("exampleLong", precisionStep, new Long(123L), new Long(999999L), true, true).asQuery();
<em>// OR, Java 1.5, using autoboxing:</em>
Query q = new LongTrieRangeFilter("exampleLong", precisionStep, 123L, 999999L, true, true).asQuery();
<em>// execute the search, as usual:</em>
TopDocs docs = searcher.search(q, 10);
for (int i = 0; i&lt;docs.scoreDocs.length; i++) {
Document doc = searcher.doc(docs.scoreDocs[i].doc);
System.out.println(doc.get("exampleString"));
// decode the stored numerical value (important!!!):
System.out.println(TrieUtils.VARIANT_8BIT.trieCodedToDouble(doc.get("exampleDouble")));
<em>// decode a prefix coded, stored field:</em>
System.out.println(TrieUtils.prefixCodedToLong(doc.get("exampleLong2")));
}
</pre>
@ -82,9 +112,9 @@ This can be done like this:</p>
<p>Comparisions of the different types of RangeQueries on an index with about 500,000 docs showed
that the old {@link org.apache.lucene.search.RangeQuery} (with raised
{@link org.apache.lucene.search.BooleanQuery} clause count) took about 30-40 secs to complete,
{@link org.apache.lucene.search.ConstantScoreRangeQuery} took 5 secs and
{@link org.apache.lucene.search.trie.TrieRangeQuery} took &lt;100ms to
complete (on an Opteron64 machine, Java 1.5, {@link org.apache.lucene.search.trie.TrieUtils#VARIANT_8BIT}).
{@link org.apache.lucene.search.ConstantScoreRangeQuery} took 5 secs and executing
{@link org.apache.lucene.search.trie.LongTrieRangeFilter}<code>.asQuery()</code> took &lt;100ms to
complete (on an Opteron64 machine, Java 1.5, 8 bit precision step).
This query type was developed for a geographic portal, where the performance for
e.g. bounding boxes or exact date/time stamps is important.</p>

View File

@ -0,0 +1,291 @@
package org.apache.lucene.search.trie;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Random;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.util.LuceneTestCase;
public class TestIntTrieRangeFilter extends LuceneTestCase
{
// distance of entries
private static final int distance = 6666;
// shift the starting of the values to the left, to also have negative values:
private static final int startOffset = - 1 << 15;
// number of docs to generate for testing
private static final int noDocs = 10000;
private static final RAMDirectory directory;
private static final IndexSearcher searcher;
static {
try {
directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(),
true, MaxFieldLength.UNLIMITED);
// Add a series of noDocs docs with increasing int values
for (int l=0; l<noDocs; l++) {
Document doc=new Document();
// add fields, that have a distance to test general functionality
final int val=distance*l+startOffset;
TrieUtils.addIndexedFields(doc,"field8", TrieUtils.trieCodeInt(val, 8));
doc.add(new Field("field8", TrieUtils.intToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
TrieUtils.addIndexedFields(doc,"field4", TrieUtils.trieCodeInt(val, 4));
doc.add(new Field("field4", TrieUtils.intToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
TrieUtils.addIndexedFields(doc,"field2", TrieUtils.trieCodeInt(val, 2));
doc.add(new Field("field2", TrieUtils.intToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
// add ascending fields with a distance of 1, beginning at -noDocs/2 to test the correct splitting of range and inclusive/exclusive
TrieUtils.addIndexedFields(doc,"ascfield8", TrieUtils.trieCodeInt(l-(noDocs/2), 8));
TrieUtils.addIndexedFields(doc,"ascfield4", TrieUtils.trieCodeInt(l-(noDocs/2), 4));
TrieUtils.addIndexedFields(doc,"ascfield2", TrieUtils.trieCodeInt(l-(noDocs/2), 2));
writer.addDocument(doc);
}
writer.optimize();
writer.close();
searcher=new IndexSearcher(directory);
} catch (Exception e) {
throw new Error(e);
}
}
private void testRange(int precisionStep) throws Exception {
String field="field"+precisionStep;
int count=3000;
int lower=(distance*3/2)+startOffset, upper=lower + count*distance + (distance/3);
IntTrieRangeFilter f=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), true, true);
TopDocs topDocs = searcher.search(f.asQuery(), null, noDocs, Sort.INDEXORDER);
System.out.println("Found "+f.getLastNumberOfTerms()+" distinct terms in range for field '"+field+"'.");
ScoreDoc[] sd = topDocs.scoreDocs;
assertNotNull(sd);
assertEquals("Score doc count", count, sd.length );
Document doc=searcher.doc(sd[0].doc);
assertEquals("First doc", 2*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
doc=searcher.doc(sd[sd.length-1].doc);
assertEquals("Last doc", (1+count)*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
}
public void testRange_8bit() throws Exception {
testRange(8);
}
public void testRange_4bit() throws Exception {
testRange(4);
}
public void testRange_2bit() throws Exception {
testRange(2);
}
private void testLeftOpenRange(int precisionStep) throws Exception {
String field="field"+precisionStep;
int count=3000;
int upper=(count-1)*distance + (distance/3) + startOffset;
IntTrieRangeFilter f=new IntTrieRangeFilter(field, precisionStep, null, new Integer(upper), true, true);
TopDocs topDocs = searcher.search(f.asQuery(), null, noDocs, Sort.INDEXORDER);
System.out.println("Found "+f.getLastNumberOfTerms()+" distinct terms in left open range for field '"+field+"'.");
ScoreDoc[] sd = topDocs.scoreDocs;
assertNotNull(sd);
assertEquals("Score doc count", count, sd.length );
Document doc=searcher.doc(sd[0].doc);
assertEquals("First doc", startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
doc=searcher.doc(sd[sd.length-1].doc);
assertEquals("Last doc", (count-1)*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
}
public void testLeftOpenRange_8bit() throws Exception {
testLeftOpenRange(8);
}
public void testLeftOpenRange_4bit() throws Exception {
testLeftOpenRange(4);
}
public void testLeftOpenRange_2bit() throws Exception {
testLeftOpenRange(2);
}
private void testRightOpenRange(int precisionStep) throws Exception {
String field="field"+precisionStep;
int count=3000;
int lower=(count-1)*distance + (distance/3) +startOffset;
IntTrieRangeFilter f=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), null, true, true);
TopDocs topDocs = searcher.search(f.asQuery(), null, noDocs, Sort.INDEXORDER);
System.out.println("Found "+f.getLastNumberOfTerms()+" distinct terms in right open range for field '"+field+"'.");
ScoreDoc[] sd = topDocs.scoreDocs;
assertNotNull(sd);
assertEquals("Score doc count", noDocs-count, sd.length );
Document doc=searcher.doc(sd[0].doc);
assertEquals("First doc", count*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
doc=searcher.doc(sd[sd.length-1].doc);
assertEquals("Last doc", (noDocs-1)*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
}
public void testRightOpenRange_8bit() throws Exception {
testRightOpenRange(8);
}
public void testRightOpenRange_4bit() throws Exception {
testRightOpenRange(4);
}
public void testRightOpenRange_2bit() throws Exception {
testRightOpenRange(2);
}
private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception {
final Random rnd=newRandom();
String field="field"+precisionStep;
// 50 random tests, the tests may also return 0 results, if min>max, but this is ok
for (int i=0; i<50; i++) {
int lower=(int)(rnd.nextDouble()*noDocs*distance)+startOffset;
int upper=(int)(rnd.nextDouble()*noDocs*distance)+startOffset;
// test inclusive range
Query tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), true, true).asQuery();
RangeQuery cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), true, true);
cq.setConstantScoreRewrite(true);
TopDocs tTopDocs = searcher.search(tq, 1);
TopDocs cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for IntTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
// test exclusive range
tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), false, false).asQuery();
cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), false, false);
cq.setConstantScoreRewrite(true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for IntTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
// test left exclusive range
tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), false, true).asQuery();
cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), false, true);
cq.setConstantScoreRewrite(true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for IntTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
// test right exclusive range
tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), true, false).asQuery();
cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), true, false);
cq.setConstantScoreRewrite(true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for IntTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
}
}
public void testRandomTrieAndClassicRangeQuery_8bit() throws Exception {
testRandomTrieAndClassicRangeQuery(8);
}
public void testRandomTrieAndClassicRangeQuery_4bit() throws Exception {
testRandomTrieAndClassicRangeQuery(4);
}
public void testRandomTrieAndClassicRangeQuery_2bit() throws Exception {
testRandomTrieAndClassicRangeQuery(2);
}
private void testRangeSplit(int precisionStep) throws Exception {
final Random rnd=newRandom();
String field="ascfield"+precisionStep;
// 50 random tests
for (int i=0; i<50; i++) {
int lower=(int)(rnd.nextDouble()*noDocs - noDocs/2);
int upper=(int)(rnd.nextDouble()*noDocs - noDocs/2);
if (lower>upper) {
int a=lower; lower=upper; upper=a;
}
// test inclusive range
Query tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), true, true).asQuery();
TopDocs tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits );
// test exclusive range
tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), false, false).asQuery();
tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to exclusive range length", Math.max(upper-lower-1, 0), tTopDocs.totalHits );
// test left exclusive range
tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), false, true).asQuery();
tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to half exclusive range length", upper-lower, tTopDocs.totalHits );
// test right exclusive range
tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), true, false).asQuery();
tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to half exclusive range length", upper-lower, tTopDocs.totalHits );
}
}
public void testRangeSplit_8bit() throws Exception {
testRangeSplit(8);
}
public void testRangeSplit_4bit() throws Exception {
testRangeSplit(4);
}
public void testRangeSplit_2bit() throws Exception {
testRangeSplit(2);
}
private void testSorting(int precisionStep) throws Exception {
final Random rnd=newRandom();
String field="field"+precisionStep;
// 10 random tests, the index order is ascending,
// so using a reverse sort field should retun descending documents
for (int i=0; i<10; i++) {
int lower=(int)(rnd.nextDouble()*noDocs*distance)+startOffset;
int upper=(int)(rnd.nextDouble()*noDocs*distance)+startOffset;
if (lower>upper) {
int a=lower; lower=upper; upper=a;
}
Query tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), true, true).asQuery();
TopDocs topDocs = searcher.search(tq, null, noDocs, new Sort(TrieUtils.getIntSortField(field, true)));
if (topDocs.totalHits==0) continue;
ScoreDoc[] sd = topDocs.scoreDocs;
assertNotNull(sd);
int last=TrieUtils.prefixCodedToInt(searcher.doc(sd[0].doc).get(field));
for (int j=1; j<sd.length; j++) {
int act=TrieUtils.prefixCodedToInt(searcher.doc(sd[j].doc).get(field));
assertTrue("Docs should be sorted backwards", last>act );
last=act;
}
}
}
public void testSorting_8bit() throws Exception {
testSorting(8);
}
public void testSorting_4bit() throws Exception {
testSorting(4);
}
public void testSorting_2bit() throws Exception {
testSorting(2);
}
}

View File

@ -0,0 +1,291 @@
package org.apache.lucene.search.trie;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Random;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.util.LuceneTestCase;
public class TestLongTrieRangeFilter extends LuceneTestCase
{
// distance of entries
private static final long distance = 66666L;
// shift the starting of the values to the left, to also have negative values:
private static final long startOffset = - 1L << 31;
// number of docs to generate for testing
private static final int noDocs = 10000;
private static final RAMDirectory directory;
private static final IndexSearcher searcher;
static {
try {
directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(),
true, MaxFieldLength.UNLIMITED);
// Add a series of noDocs docs with increasing long values
for (int l=0; l<noDocs; l++) {
Document doc=new Document();
// add fields, that have a distance to test general functionality
final long val=distance*l+startOffset;
TrieUtils.addIndexedFields(doc,"field8", TrieUtils.trieCodeLong(val, 8));
doc.add(new Field("field8", TrieUtils.longToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
TrieUtils.addIndexedFields(doc,"field4", TrieUtils.trieCodeLong(val, 4));
doc.add(new Field("field4", TrieUtils.longToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
TrieUtils.addIndexedFields(doc,"field2", TrieUtils.trieCodeLong(val, 2));
doc.add(new Field("field2", TrieUtils.longToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
// add ascending fields with a distance of 1, beginning at -noDocs/2 to test the correct splitting of range and inclusive/exclusive
TrieUtils.addIndexedFields(doc,"ascfield8", TrieUtils.trieCodeLong(l-(noDocs/2), 8));
TrieUtils.addIndexedFields(doc,"ascfield4", TrieUtils.trieCodeLong(l-(noDocs/2), 4));
TrieUtils.addIndexedFields(doc,"ascfield2", TrieUtils.trieCodeLong(l-(noDocs/2), 2));
writer.addDocument(doc);
}
writer.optimize();
writer.close();
searcher=new IndexSearcher(directory);
} catch (Exception e) {
throw new Error(e);
}
}
private void testRange(int precisionStep) throws Exception {
String field="field"+precisionStep;
int count=3000;
long lower=(distance*3/2)+startOffset, upper=lower + count*distance + (distance/3);
LongTrieRangeFilter f=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), true, true);
TopDocs topDocs = searcher.search(f.asQuery(), null, noDocs, Sort.INDEXORDER);
System.out.println("Found "+f.getLastNumberOfTerms()+" distinct terms in range for field '"+field+"'.");
ScoreDoc[] sd = topDocs.scoreDocs;
assertNotNull(sd);
assertEquals("Score doc count", count, sd.length );
Document doc=searcher.doc(sd[0].doc);
assertEquals("First doc", 2*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
doc=searcher.doc(sd[sd.length-1].doc);
assertEquals("Last doc", (1+count)*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
}
public void testRange_8bit() throws Exception {
testRange(8);
}
public void testRange_4bit() throws Exception {
testRange(4);
}
public void testRange_2bit() throws Exception {
testRange(2);
}
private void testLeftOpenRange(int precisionStep) throws Exception {
String field="field"+precisionStep;
int count=3000;
long upper=(count-1)*distance + (distance/3) + startOffset;
LongTrieRangeFilter f=new LongTrieRangeFilter(field, precisionStep, null, new Long(upper), true, true);
TopDocs topDocs = searcher.search(f.asQuery(), null, noDocs, Sort.INDEXORDER);
System.out.println("Found "+f.getLastNumberOfTerms()+" distinct terms in left open range for field '"+field+"'.");
ScoreDoc[] sd = topDocs.scoreDocs;
assertNotNull(sd);
assertEquals("Score doc count", count, sd.length );
Document doc=searcher.doc(sd[0].doc);
assertEquals("First doc", startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
doc=searcher.doc(sd[sd.length-1].doc);
assertEquals("Last doc", (count-1)*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
}
public void testLeftOpenRange_8bit() throws Exception {
testLeftOpenRange(8);
}
public void testLeftOpenRange_4bit() throws Exception {
testLeftOpenRange(4);
}
public void testLeftOpenRange_2bit() throws Exception {
testLeftOpenRange(2);
}
private void testRightOpenRange(int precisionStep) throws Exception {
String field="field"+precisionStep;
int count=3000;
long lower=(count-1)*distance + (distance/3) +startOffset;
LongTrieRangeFilter f=new LongTrieRangeFilter(field, precisionStep, new Long(lower), null, true, true);
TopDocs topDocs = searcher.search(f.asQuery(), null, noDocs, Sort.INDEXORDER);
System.out.println("Found "+f.getLastNumberOfTerms()+" distinct terms in right open range for field '"+field+"'.");
ScoreDoc[] sd = topDocs.scoreDocs;
assertNotNull(sd);
assertEquals("Score doc count", noDocs-count, sd.length );
Document doc=searcher.doc(sd[0].doc);
assertEquals("First doc", count*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
doc=searcher.doc(sd[sd.length-1].doc);
assertEquals("Last doc", (noDocs-1)*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
}
public void testRightOpenRange_8bit() throws Exception {
testRightOpenRange(8);
}
public void testRightOpenRange_4bit() throws Exception {
testRightOpenRange(4);
}
public void testRightOpenRange_2bit() throws Exception {
testRightOpenRange(2);
}
private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception {
final Random rnd=newRandom();
String field="field"+precisionStep;
// 50 random tests, the tests may also return 0 results, if min>max, but this is ok
for (int i=0; i<50; i++) {
long lower=(long)(rnd.nextDouble()*noDocs*distance)+startOffset;
long upper=(long)(rnd.nextDouble()*noDocs*distance)+startOffset;
// test inclusive range
Query tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), true, true).asQuery();
RangeQuery cq=new RangeQuery(field, TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), true, true);
cq.setConstantScoreRewrite(true);
TopDocs tTopDocs = searcher.search(tq, 1);
TopDocs cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for LongTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
// test exclusive range
tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), false, false).asQuery();
cq=new RangeQuery(field, TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), false, false);
cq.setConstantScoreRewrite(true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for LongTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
// test left exclusive range
tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), false, true).asQuery();
cq=new RangeQuery(field, TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), false, true);
cq.setConstantScoreRewrite(true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for LongTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
// test right exclusive range
tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), true, false).asQuery();
cq=new RangeQuery(field, TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), true, false);
cq.setConstantScoreRewrite(true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for LongTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
}
}
public void testRandomTrieAndClassicRangeQuery_8bit() throws Exception {
testRandomTrieAndClassicRangeQuery(8);
}
public void testRandomTrieAndClassicRangeQuery_4bit() throws Exception {
testRandomTrieAndClassicRangeQuery(4);
}
public void testRandomTrieAndClassicRangeQuery_2bit() throws Exception {
testRandomTrieAndClassicRangeQuery(2);
}
private void testRangeSplit(int precisionStep) throws Exception {
final Random rnd=newRandom();
String field="ascfield"+precisionStep;
// 50 random tests
for (int i=0; i<50; i++) {
long lower=(long)(rnd.nextDouble()*noDocs - noDocs/2);
long upper=(long)(rnd.nextDouble()*noDocs - noDocs/2);
if (lower>upper) {
long a=lower; lower=upper; upper=a;
}
// test inclusive range
Query tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), true, true).asQuery();
TopDocs tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits );
// test exclusive range
tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), false, false).asQuery();
tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to exclusive range length", Math.max(upper-lower-1, 0), tTopDocs.totalHits );
// test left exclusive range
tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), false, true).asQuery();
tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to half exclusive range length", upper-lower, tTopDocs.totalHits );
// test right exclusive range
tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), true, false).asQuery();
tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to half exclusive range length", upper-lower, tTopDocs.totalHits );
}
}
public void testRangeSplit_8bit() throws Exception {
testRangeSplit(8);
}
public void testRangeSplit_4bit() throws Exception {
testRangeSplit(4);
}
public void testRangeSplit_2bit() throws Exception {
testRangeSplit(2);
}
private void testSorting(int precisionStep) throws Exception {
final Random rnd=newRandom();
String field="field"+precisionStep;
// 10 random tests, the index order is ascending,
// so using a reverse sort field should retun descending documents
for (int i=0; i<10; i++) {
long lower=(long)(rnd.nextDouble()*noDocs*distance)+startOffset;
long upper=(long)(rnd.nextDouble()*noDocs*distance)+startOffset;
if (lower>upper) {
long a=lower; lower=upper; upper=a;
}
Query tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), true, true).asQuery();
TopDocs topDocs = searcher.search(tq, null, noDocs, new Sort(TrieUtils.getLongSortField(field, true)));
if (topDocs.totalHits==0) continue;
ScoreDoc[] sd = topDocs.scoreDocs;
assertNotNull(sd);
long last=TrieUtils.prefixCodedToLong(searcher.doc(sd[0].doc).get(field));
for (int j=1; j<sd.length; j++) {
long act=TrieUtils.prefixCodedToLong(searcher.doc(sd[j].doc).get(field));
assertTrue("Docs should be sorted backwards", last>act );
last=act;
}
}
}
public void testSorting_8bit() throws Exception {
testSorting(8);
}
public void testSorting_4bit() throws Exception {
testSorting(4);
}
public void testSorting_2bit() throws Exception {
testSorting(2);
}
}

View File

@ -1,265 +0,0 @@
package org.apache.lucene.search.trie;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Random;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.util.LuceneTestCase;
public class TestTrieRangeQuery extends LuceneTestCase
{
private static final long distance=66666;
private static final RAMDirectory directory;
private static final IndexSearcher searcher;
static {
try {
directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(),
true, MaxFieldLength.UNLIMITED);
// Add a series of 10000 docs with increasing long values
for (long l=0L; l<10000L; l++) {
Document doc=new Document();
// add fields, that have a distance to test general functionality
TrieUtils.VARIANT_8BIT.addLongTrieCodedDocumentField(
doc, "field8", distance*l, true /*index it*/, Field.Store.YES
);
TrieUtils.VARIANT_4BIT.addLongTrieCodedDocumentField(
doc, "field4", distance*l, true /*index it*/, Field.Store.YES
);
TrieUtils.VARIANT_2BIT.addLongTrieCodedDocumentField(
doc, "field2", distance*l, true /*index it*/, Field.Store.YES
);
// add ascending fields with a distance of 1 to test the correct splitting of range and inclusive/exclusive
TrieUtils.VARIANT_8BIT.addLongTrieCodedDocumentField(
doc, "ascfield8", l, true /*index it*/, Field.Store.NO
);
TrieUtils.VARIANT_4BIT.addLongTrieCodedDocumentField(
doc, "ascfield4", l, true /*index it*/, Field.Store.NO
);
TrieUtils.VARIANT_2BIT.addLongTrieCodedDocumentField(
doc, "ascfield2", l, true /*index it*/, Field.Store.NO
);
writer.addDocument(doc);
}
writer.optimize();
writer.close();
searcher=new IndexSearcher(directory);
} catch (Exception e) {
throw new Error(e);
}
}
private void testRange(final TrieUtils variant) throws Exception {
String field="field"+variant.TRIE_BITS;
int count=3000;
long lower=96666L, upper=lower + count*distance + 1234L;
TrieRangeQuery q=new TrieRangeQuery(field, new Long(lower), new Long(upper), true, true, variant);
TopDocs topDocs = searcher.search(q, null, 10000, Sort.INDEXORDER);
System.out.println("Found "+q.getLastNumberOfTerms()+" distinct terms in range for field '"+field+"'.");
ScoreDoc[] sd = topDocs.scoreDocs;
assertNotNull(sd);
assertEquals("Score docs must match "+count+" docs, found "+sd.length+" docs", sd.length, count );
Document doc=searcher.doc(sd[0].doc);
assertEquals("First doc should be "+(2*distance), variant.trieCodedToLong(doc.get(field)), 2*distance );
doc=searcher.doc(sd[sd.length-1].doc);
assertEquals("Last doc should be "+((1+count)*distance), variant.trieCodedToLong(doc.get(field)), (1+count)*distance );
}
public void testRange_8bit() throws Exception {
testRange(TrieUtils.VARIANT_8BIT);
}
public void testRange_4bit() throws Exception {
testRange(TrieUtils.VARIANT_4BIT);
}
public void testRange_2bit() throws Exception {
testRange(TrieUtils.VARIANT_2BIT);
}
private void testLeftOpenRange(final TrieUtils variant) throws Exception {
String field="field"+variant.TRIE_BITS;
int count=3000;
long upper=(count-1)*distance + 1234L;
TrieRangeQuery q=new TrieRangeQuery(field, null, new Long(upper), true, true, variant);
TopDocs topDocs = searcher.search(q, null, 10000, Sort.INDEXORDER);
System.out.println("Found "+q.getLastNumberOfTerms()+" distinct terms in left open range for field '"+field+"'.");
ScoreDoc[] sd = topDocs.scoreDocs;
assertNotNull(sd);
assertEquals("Score docs must match "+count+" docs, found "+sd.length+" docs", sd.length, count );
Document doc=searcher.doc(sd[0].doc);
assertEquals("First doc should be 0", variant.trieCodedToLong(doc.get(field)), 0L );
doc=searcher.doc(sd[sd.length-1].doc);
assertEquals("Last doc should be "+((count-1)*distance), variant.trieCodedToLong(doc.get(field)), (count-1)*distance );
}
public void testLeftOpenRange_8bit() throws Exception {
testLeftOpenRange(TrieUtils.VARIANT_8BIT);
}
public void testLeftOpenRange_4bit() throws Exception {
testLeftOpenRange(TrieUtils.VARIANT_4BIT);
}
public void testLeftOpenRange_2bit() throws Exception {
testLeftOpenRange(TrieUtils.VARIANT_2BIT);
}
private void testRandomTrieAndClassicRangeQuery(final TrieUtils variant) throws Exception {
final Random rnd=newRandom();
String field="field"+variant.TRIE_BITS;
// 50 random tests, the tests may also return 0 results, if min>max, but this is ok
for (int i=0; i<50; i++) {
long lower=(long)(rnd.nextDouble()*10000L*distance);
long upper=(long)(rnd.nextDouble()*10000L*distance);
// test inclusive range
TrieRangeQuery tq=new TrieRangeQuery(field, new Long(lower), new Long(upper), true, true, variant);
RangeQuery cq=new RangeQuery(field, variant.longToTrieCoded(lower), variant.longToTrieCoded(upper), true, true);
cq.setConstantScoreRewrite(true);
TopDocs tTopDocs = searcher.search(tq, 1);
TopDocs cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for TrieRangeQuery and RangeQuery must be equal", tTopDocs.totalHits, cTopDocs.totalHits );
// test exclusive range
tq=new TrieRangeQuery(field, new Long(lower), new Long(upper), false, false, variant);
cq=new RangeQuery(field, variant.longToTrieCoded(lower), variant.longToTrieCoded(upper), false, false);
cq.setConstantScoreRewrite(true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for TrieRangeQuery and RangeQuery must be equal", tTopDocs.totalHits, cTopDocs.totalHits );
// test left exclusive range
tq=new TrieRangeQuery(field, new Long(lower), new Long(upper), false, true, variant);
cq=new RangeQuery(field, variant.longToTrieCoded(lower), variant.longToTrieCoded(upper), false, true);
cq.setConstantScoreRewrite(true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for TrieRangeQuery and RangeQuery must be equal", tTopDocs.totalHits, cTopDocs.totalHits );
// test right exclusive range
tq=new TrieRangeQuery(field, new Long(lower), new Long(upper), true, false, variant);
cq=new RangeQuery(field, variant.longToTrieCoded(lower), variant.longToTrieCoded(upper), true, false);
cq.setConstantScoreRewrite(true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for TrieRangeQuery and RangeQuery must be equal", tTopDocs.totalHits, cTopDocs.totalHits );
}
}
public void testRandomTrieAndClassicRangeQuery_8bit() throws Exception {
testRandomTrieAndClassicRangeQuery(TrieUtils.VARIANT_8BIT);
}
public void testRandomTrieAndClassicRangeQuery_4bit() throws Exception {
testRandomTrieAndClassicRangeQuery(TrieUtils.VARIANT_4BIT);
}
public void testRandomTrieAndClassicRangeQuery_2bit() throws Exception {
testRandomTrieAndClassicRangeQuery(TrieUtils.VARIANT_2BIT);
}
private void testRangeSplit(final TrieUtils variant) throws Exception {
final Random rnd=newRandom();
String field="ascfield"+variant.TRIE_BITS;
// 50 random tests
for (int i=0; i<50; i++) {
long lower=(long)(rnd.nextDouble()*10000L);
long upper=(long)(rnd.nextDouble()*10000L);
if (lower>upper) {
long a=lower; lower=upper; upper=a;
}
// test inclusive range
TrieRangeQuery tq=new TrieRangeQuery(field, new Long(lower), new Long(upper), true, true, variant);
TopDocs tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to inclusive range length", tTopDocs.totalHits, upper-lower+1 );
// test exclusive range
tq=new TrieRangeQuery(field, new Long(lower), new Long(upper), false, false, variant);
tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to exclusive range length", tTopDocs.totalHits, Math.max(upper-lower-1, 0) );
// test left exclusive range
tq=new TrieRangeQuery(field, new Long(lower), new Long(upper), false, true, variant);
tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to half exclusive range length", tTopDocs.totalHits, upper-lower );
// test right exclusive range
tq=new TrieRangeQuery(field, new Long(lower), new Long(upper), true, false, variant);
tTopDocs = searcher.search(tq, 1);
assertEquals("Returned count of range query must be equal to half exclusive range length", tTopDocs.totalHits, upper-lower );
}
}
public void testRangeSplit_8bit() throws Exception {
testRangeSplit(TrieUtils.VARIANT_8BIT);
}
public void testRangeSplit_4bit() throws Exception {
testRangeSplit(TrieUtils.VARIANT_4BIT);
}
public void testRangeSplit_2bit() throws Exception {
testRangeSplit(TrieUtils.VARIANT_2BIT);
}
private void testSorting(final TrieUtils variant) throws Exception {
final Random rnd=newRandom();
String field="field"+variant.TRIE_BITS;
// 10 random tests, the index order is ascending,
// so using a reverse sort field should retun descending documents
for (int i=0; i<10; i++) {
long lower=(long)(rnd.nextDouble()*10000L*distance);
long upper=(long)(rnd.nextDouble()*10000L*distance);
if (lower>upper) {
long a=lower; lower=upper; upper=a;
}
TrieRangeQuery tq=new TrieRangeQuery(field, new Long(lower), new Long(upper), true, true, variant);
TopDocs topDocs = searcher.search(tq, null, 10000, new Sort(variant.getSortField(field, true)));
if (topDocs.totalHits==0) continue;
ScoreDoc[] sd = topDocs.scoreDocs;
assertNotNull(sd);
long last=variant.trieCodedToLong(searcher.doc(sd[0].doc).get(field));
for (int j=1; j<sd.length; j++) {
long act=variant.trieCodedToLong(searcher.doc(sd[j].doc).get(field));
assertTrue("Docs should be sorted backwards", last>act );
last=act;
}
}
}
public void testSorting_8bit() throws Exception {
testSorting(TrieUtils.VARIANT_8BIT);
}
public void testSorting_4bit() throws Exception {
testSorting(TrieUtils.VARIANT_4BIT);
}
public void testSorting_2bit() throws Exception {
testSorting(TrieUtils.VARIANT_2BIT);
}
}

View File

@ -17,155 +17,304 @@ package org.apache.lucene.search.trie;
* limitations under the License.
*/
import java.util.Date;
import java.util.GregorianCalendar;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.OpenBitSet;
import java.util.Arrays;
import java.util.Iterator;
public class TestTrieUtils extends LuceneTestCase {
public void testSpecialValues() throws Exception {
// Variant 8bit values
assertEquals( TrieUtils.VARIANT_8BIT.TRIE_CODED_NUMERIC_MIN, "\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100");
assertEquals( TrieUtils.VARIANT_8BIT.TRIE_CODED_NUMERIC_MAX, "\u01ff\u01ff\u01ff\u01ff\u01ff\u01ff\u01ff\u01ff");
assertEquals( TrieUtils.VARIANT_8BIT.longToTrieCoded(-1), "\u017f\u01ff\u01ff\u01ff\u01ff\u01ff\u01ff\u01ff");
assertEquals( TrieUtils.VARIANT_8BIT.longToTrieCoded(0), "\u0180\u0100\u0100\u0100\u0100\u0100\u0100\u0100");
assertEquals( TrieUtils.VARIANT_8BIT.longToTrieCoded(1), "\u0180\u0100\u0100\u0100\u0100\u0100\u0100\u0101");
// Variant 4bit values
assertEquals( TrieUtils.VARIANT_4BIT.TRIE_CODED_NUMERIC_MIN, "\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100");
assertEquals( TrieUtils.VARIANT_4BIT.TRIE_CODED_NUMERIC_MAX, "\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f");
assertEquals( TrieUtils.VARIANT_4BIT.longToTrieCoded(-1), "\u0107\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f\u010f");
assertEquals( TrieUtils.VARIANT_4BIT.longToTrieCoded(0), "\u0108\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100");
assertEquals( TrieUtils.VARIANT_4BIT.longToTrieCoded(1), "\u0108\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0100\u0101");
// TODO: 2bit tests
}
private void testBinaryOrderingAndIncrement(TrieUtils variant) throws Exception {
public void testLongConversionAndOrdering() throws Exception {
// generate a series of encoded longs, each numerical one bigger than the one before
String last=null;
for (long l=-100000L; l<100000L; l++) {
String act=variant.longToTrieCoded(l);
String act=TrieUtils.longToPrefixCoded(l);
if (last!=null) {
// test if smaller
assertTrue( last.compareTo(act) < 0 );
// test the increment method (the last incremented by one should be the actual)
assertEquals( variant.incrementTrieCoded(last), act );
// test the decrement method (the actual decremented by one should be the last)
assertEquals( last, variant.decrementTrieCoded(act) );
assertTrue("actual bigger than last", last.compareTo(act) < 0 );
}
// test is back and forward conversion works
assertEquals("forward and back conversion should generate same long", l, TrieUtils.prefixCodedToLong(act));
// next step
last=act;
}
}
public void testBinaryOrderingAndIncrement_8bit() throws Exception {
testBinaryOrderingAndIncrement(TrieUtils.VARIANT_8BIT);
public void testIntConversionAndOrdering() throws Exception {
// generate a series of encoded ints, each numerical one bigger than the one before
String last=null;
for (int i=-100000; i<100000; i++) {
String act=TrieUtils.intToPrefixCoded(i);
if (last!=null) {
// test if smaller
assertTrue("actual bigger than last", last.compareTo(act) < 0 );
}
// test is back and forward conversion works
assertEquals("forward and back conversion should generate same int", i, TrieUtils.prefixCodedToInt(act));
// next step
last=act;
}
}
public void testBinaryOrderingAndIncrement_4bit() throws Exception {
testBinaryOrderingAndIncrement(TrieUtils.VARIANT_8BIT);
}
public void testBinaryOrderingAndIncrement_2bit() throws Exception {
testBinaryOrderingAndIncrement(TrieUtils.VARIANT_8BIT);
}
private void testLongs(TrieUtils variant) throws Exception {
public void testLongSpecialValues() throws Exception {
long[] vals=new long[]{
Long.MIN_VALUE, -5000L, -4000L, -3000L, -2000L, -1000L, 0L,
1L, 10L, 300L, 5000L, Long.MAX_VALUE-2, Long.MAX_VALUE-1, Long.MAX_VALUE
Long.MIN_VALUE, Long.MIN_VALUE+1, Long.MIN_VALUE+2, -5003400000000L,
-4000L, -3000L, -2000L, -1000L, -1L, 0L, 1L, 10L, 300L, 50006789999999999L, Long.MAX_VALUE-2, Long.MAX_VALUE-1, Long.MAX_VALUE
};
String[] trieVals=new String[vals.length];
String[] prefixVals=new String[vals.length];
// check back and forth conversion
for (int i=0; i<vals.length; i++) {
trieVals[i]=variant.longToTrieCoded(vals[i]);
assertEquals( "Back and forth conversion should return same value", vals[i], variant.trieCodedToLong(trieVals[i]) );
assertEquals( "Automatic back conversion with encoding detection should return same value", vals[i], TrieUtils.trieCodedToLongAuto(trieVals[i]) );
prefixVals[i]=TrieUtils.longToPrefixCoded(vals[i]);
// check forward and back conversion
assertEquals( "forward and back conversion should generate same long", vals[i], TrieUtils.prefixCodedToLong(prefixVals[i]) );
// test if decoding values as int fails correctly
try {
TrieUtils.prefixCodedToInt(prefixVals[i]);
fail("decoding a prefix coded long value as int should fail");
} catch (NumberFormatException e) {
// worked
}
}
// check sort order (trieVals should be ascending)
for (int i=1; i<vals.length; i++) {
assertTrue( trieVals[i-1].compareTo( trieVals[i] ) < 0 );
// check sort order (prefixVals should be ascending)
for (int i=1; i<prefixVals.length; i++) {
assertTrue( "check sort order", prefixVals[i-1].compareTo( prefixVals[i] ) < 0 );
}
// check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits
for (int i=0; i<vals.length; i++) {
for (int j=0; j<64; j++) {
long prefixVal=TrieUtils.prefixCodedToLong(TrieUtils.longToPrefixCoded(vals[i], j));
long mask=(1L << j) - 1L;
assertEquals( "difference between prefix val and original value for "+vals[i]+" with shift="+j, vals[i] & mask, vals[i]-prefixVal );
}
}
}
public void testLongs_8bit() throws Exception {
testLongs(TrieUtils.VARIANT_8BIT);
public void testIntSpecialValues() throws Exception {
int[] vals=new int[]{
Integer.MIN_VALUE, Integer.MIN_VALUE+1, Integer.MIN_VALUE+2, -64765767,
-4000, -3000, -2000, -1000, -1, 0, 1, 10, 300, 765878989, Integer.MAX_VALUE-2, Integer.MAX_VALUE-1, Integer.MAX_VALUE
};
String[] prefixVals=new String[vals.length];
for (int i=0; i<vals.length; i++) {
prefixVals[i]=TrieUtils.intToPrefixCoded(vals[i]);
// check forward and back conversion
assertEquals( "forward and back conversion should generate same int", vals[i], TrieUtils.prefixCodedToInt(prefixVals[i]) );
// test if decoding values as long fails correctly
try {
TrieUtils.prefixCodedToLong(prefixVals[i]);
fail("decoding a prefix coded int value as long should fail");
} catch (NumberFormatException e) {
// worked
}
}
// check sort order (prefixVals should be ascending)
for (int i=1; i<prefixVals.length; i++) {
assertTrue( "check sort order", prefixVals[i-1].compareTo( prefixVals[i] ) < 0 );
}
// check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits
for (int i=0; i<vals.length; i++) {
for (int j=0; j<32; j++) {
int prefixVal=TrieUtils.prefixCodedToInt(TrieUtils.intToPrefixCoded(vals[i], j));
int mask=(1 << j) - 1;
assertEquals( "difference between prefix val and original value for "+vals[i]+" with shift="+j, vals[i] & mask, vals[i]-prefixVal );
}
}
}
public void testLongs_4bit() throws Exception {
testLongs(TrieUtils.VARIANT_4BIT);
}
public void testLongs_2bit() throws Exception {
testLongs(TrieUtils.VARIANT_2BIT);
}
private void testDoubles(TrieUtils variant) throws Exception {
public void testDoubles() throws Exception {
double[] vals=new double[]{
Double.NEGATIVE_INFINITY, -2.3E25, -1.0E15, -1.0, -1.0E-1, -1.0E-2, -0.0,
+0.0, 1.0E-2, 1.0E-1, 1.0, 1.0E15, 2.3E25, Double.POSITIVE_INFINITY
};
String[] trieVals=new String[vals.length];
long[] longVals=new long[vals.length];
// check back and forth conversion
// check forward and back conversion
for (int i=0; i<vals.length; i++) {
trieVals[i]=variant.doubleToTrieCoded(vals[i]);
assertTrue( "Back and forth conversion should return same value", vals[i]==variant.trieCodedToDouble(trieVals[i]) );
assertTrue( "Automatic back conversion with encoding detection should return same value", vals[i]==TrieUtils.trieCodedToDoubleAuto(trieVals[i]) );
longVals[i]=TrieUtils.doubleToSortableLong(vals[i]);
assertTrue( "forward and back conversion should generate same double", Double.compare(vals[i], TrieUtils.sortableLongToDouble(longVals[i]))==0 );
}
// check sort order (trieVals should be ascending)
for (int i=1; i<vals.length; i++) {
assertTrue( trieVals[i-1].compareTo( trieVals[i] ) < 0 );
// check sort order (prefixVals should be ascending)
for (int i=1; i<longVals.length; i++) {
assertTrue( "check sort order", longVals[i-1] < longVals[i] );
}
}
public void testDoubles_8bit() throws Exception {
testDoubles(TrieUtils.VARIANT_8BIT);
}
public void testDoubles_4bit() throws Exception {
testDoubles(TrieUtils.VARIANT_4BIT);
}
public void testDoubles_2bit() throws Exception {
testDoubles(TrieUtils.VARIANT_2BIT);
}
private void testDates(TrieUtils variant) throws Exception {
Date[] vals=new Date[]{
new GregorianCalendar(1000,1,1).getTime(),
new GregorianCalendar(1999,1,1).getTime(),
new GregorianCalendar(2000,1,1).getTime(),
new GregorianCalendar(2001,1,1).getTime()
public void testFloats() throws Exception {
float[] vals=new float[]{
Float.NEGATIVE_INFINITY, -2.3E25f, -1.0E15f, -1.0f, -1.0E-1f, -1.0E-2f, -0.0f,
+0.0f, 1.0E-2f, 1.0E-1f, 1.0f, 1.0E15f, 2.3E25f, Float.POSITIVE_INFINITY
};
String[] trieVals=new String[vals.length];
int[] intVals=new int[vals.length];
// check back and forth conversion
// check forward and back conversion
for (int i=0; i<vals.length; i++) {
trieVals[i]=variant.dateToTrieCoded(vals[i]);
assertEquals( "Back and forth conversion should return same value", vals[i], variant.trieCodedToDate(trieVals[i]) );
assertEquals( "Automatic back conversion with encoding detection should return same value", vals[i], TrieUtils.trieCodedToDateAuto(trieVals[i]) );
intVals[i]=TrieUtils.floatToSortableInt(vals[i]);
assertTrue( "forward and back conversion should generate same double", Float.compare(vals[i], TrieUtils.sortableIntToFloat(intVals[i]))==0 );
}
// check sort order (trieVals should be ascending)
for (int i=1; i<vals.length; i++) {
assertTrue( trieVals[i-1].compareTo( trieVals[i] ) < 0 );
// check sort order (prefixVals should be ascending)
for (int i=1; i<intVals.length; i++) {
assertTrue( "check sort order", intVals[i-1] < intVals[i] );
}
}
// INFO: Tests for trieCodeLong()/trieCodeInt() not needed because implicitely tested by range filter tests
/** Note: The neededBounds iterator must be unsigned (easier understanding what's happening) */
protected void assertLongRangeSplit(final long lower, final long upper, int precisionStep,
final boolean useBitSet, final Iterator neededBounds
) throws Exception {
final OpenBitSet bits=useBitSet ? new OpenBitSet(upper-lower+1) : null;
TrieUtils.splitLongRange(new TrieUtils.LongRangeBuilder() {
public void addRange(int precisionStep, long min, long max, int shift) {
assertTrue("min, max should be inside bounds", min>=lower && min<=upper && max>=lower && max<=upper);
if (useBitSet) for (long l=min; l<=max; l++) {
assertFalse("ranges should not overlap", bits.getAndSet(l-lower) );
}
// make unsigned longs for easier display and understanding
min ^= 0x8000000000000000L;
max ^= 0x8000000000000000L;
//System.out.println("new Long(0x"+Long.toHexString(min>>>shift)+"L),new Long(0x"+Long.toHexString(max>>>shift)+"L),");
assertEquals( "inner min bound", ((Long)neededBounds.next()).longValue(), min>>>shift);
assertEquals( "inner max bound", ((Long)neededBounds.next()).longValue(), max>>>shift);
}
}, precisionStep, lower, upper);
if (useBitSet) {
// after flipping all bits in the range, the cardinality should be zero
bits.flip(0,upper-lower+1);
assertTrue("The sub-range concenated should match the whole range", bits.isEmpty());
}
}
public void testSplitLongRange() throws Exception {
// a hard-coded "standard" range
assertLongRangeSplit(-5000L, 9500L, 4, true, Arrays.asList(new Long[]{
new Long(0x7fffffffffffec78L),new Long(0x7fffffffffffec7fL),
new Long(0x8000000000002510L),new Long(0x800000000000251cL),
new Long(0x7fffffffffffec8L), new Long(0x7fffffffffffecfL),
new Long(0x800000000000250L), new Long(0x800000000000250L),
new Long(0x7fffffffffffedL), new Long(0x7fffffffffffefL),
new Long(0x80000000000020L), new Long(0x80000000000024L),
new Long(0x7ffffffffffffL), new Long(0x8000000000001L)
}).iterator());
// the same with no range splitting
assertLongRangeSplit(-5000L, 9500L, 64, true, Arrays.asList(new Long[]{
new Long(0x7fffffffffffec78L),new Long(0x800000000000251cL)
}).iterator());
// this tests optimized range splitting, if one of the inner bounds
// is also the bound of the next lower precision, it should be used completely
assertLongRangeSplit(0L, 1024L+63L, 4, true, Arrays.asList(new Long[]{
new Long(0x800000000000040L), new Long(0x800000000000043L),
new Long(0x80000000000000L), new Long(0x80000000000003L)
}).iterator());
// the full long range should only consist of a lowest precision range; no bitset testing here, as too much memory needed :-)
assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 8, false, Arrays.asList(new Long[]{
new Long(0x00L),new Long(0xffL)
}).iterator());
public void testDates_8bit() throws Exception {
testDates(TrieUtils.VARIANT_8BIT);
// the same with precisionStep=4
assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 4, false, Arrays.asList(new Long[]{
new Long(0x0L),new Long(0xfL)
}).iterator());
// the same with precisionStep=2
assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 2, false, Arrays.asList(new Long[]{
new Long(0x0L),new Long(0x3L)
}).iterator());
// the same with precisionStep=1
assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 1, false, Arrays.asList(new Long[]{
new Long(0x0L),new Long(0x1L)
}).iterator());
}
public void testDates_4bit() throws Exception {
testDates(TrieUtils.VARIANT_4BIT);
/** Note: The neededBounds iterator must be unsigned (easier understanding what's happening) */
protected void assertIntRangeSplit(final int lower, final int upper, int precisionStep,
final boolean useBitSet, final Iterator neededBounds
) throws Exception {
final OpenBitSet bits=useBitSet ? new OpenBitSet(upper-lower+1) : null;
TrieUtils.splitIntRange(new TrieUtils.IntRangeBuilder() {
public void addRange(int precisionStep, int min, int max, int shift) {
assertTrue("min, max should be inside bounds", min>=lower && min<=upper && max>=lower && max<=upper);
if (useBitSet) for (int i=min; i<=max; i++) {
assertFalse("ranges should not overlap", bits.getAndSet(i-lower) );
}
// make unsigned ints for easier display and understanding
min ^= 0x80000000;
max ^= 0x80000000;
//System.out.println("new Integer(0x"+Integer.toHexString(min>>>shift)+"),new Integer(0x"+Integer.toHexString(max>>>shift)+"),");
assertEquals( "inner min bound", ((Integer)neededBounds.next()).intValue(), min>>>shift);
assertEquals( "inner max bound", ((Integer)neededBounds.next()).intValue(), max>>>shift);
}
}, precisionStep, lower, upper);
if (useBitSet) {
// after flipping all bits in the range, the cardinality should be zero
bits.flip(0,upper-lower+1);
assertTrue("The sub-range concenated should match the whole range", bits.isEmpty());
}
}
public void testSplitIntRange() throws Exception {
// a hard-coded "standard" range
assertIntRangeSplit(-5000, 9500, 4, true, Arrays.asList(new Integer[]{
new Integer(0x7fffec78),new Integer(0x7fffec7f),
new Integer(0x80002510),new Integer(0x8000251c),
new Integer(0x7fffec8), new Integer(0x7fffecf),
new Integer(0x8000250), new Integer(0x8000250),
new Integer(0x7fffed), new Integer(0x7fffef),
new Integer(0x800020), new Integer(0x800024),
new Integer(0x7ffff), new Integer(0x80001)
}).iterator());
// the same with no range splitting
assertIntRangeSplit(-5000, 9500, 32, true, Arrays.asList(new Integer[]{
new Integer(0x7fffec78),new Integer(0x8000251c)
}).iterator());
// this tests optimized range splitting, if one of the inner bounds
// is also the bound of the next lower precision, it should be used completely
assertIntRangeSplit(0, 1024+63, 4, true, Arrays.asList(new Integer[]{
new Integer(0x8000040), new Integer(0x8000043),
new Integer(0x800000), new Integer(0x800003)
}).iterator());
// the full int range should only consist of a lowest precision range; no bitset testing here, as too much memory needed :-)
assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 8, false, Arrays.asList(new Integer[]{
new Integer(0x00),new Integer(0xff)
}).iterator());
public void testDates_2bit() throws Exception {
testDates(TrieUtils.VARIANT_2BIT);
// the same with precisionStep=4
assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 4, false, Arrays.asList(new Integer[]{
new Integer(0x0),new Integer(0xf)
}).iterator());
// the same with precisionStep=2
assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 2, false, Arrays.asList(new Integer[]{
new Integer(0x0),new Integer(0x3)
}).iterator());
// the same with precisionStep=1
assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 1, false, Arrays.asList(new Integer[]{
new Integer(0x0),new Integer(0x1)
}).iterator());
}
}