mirror of https://github.com/apache/lucene.git
LUCENE-1582: Make TrieRange completely independent from Document/Field with TokenStream of prefix encoded values
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@762710 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b86ded6fae
commit
13ae26b7fe
|
@ -18,7 +18,6 @@ package org.apache.lucene.search.trie;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.search.Query;
|
||||
|
@ -33,10 +32,10 @@ import org.apache.lucene.util.ToStringUtils;
|
|||
|
||||
abstract class AbstractTrieRangeFilter extends Filter {
|
||||
|
||||
AbstractTrieRangeFilter(final String[] fields, final int precisionStep,
|
||||
AbstractTrieRangeFilter(final String field, final int precisionStep,
|
||||
Number min, Number max, final boolean minInclusive, final boolean maxInclusive
|
||||
) {
|
||||
this.fields=(String[])fields.clone();
|
||||
this.field=field.intern();
|
||||
this.precisionStep=precisionStep;
|
||||
this.min=min;
|
||||
this.max=max;
|
||||
|
@ -51,7 +50,7 @@ abstract class AbstractTrieRangeFilter extends Filter {
|
|||
|
||||
public String toString(final String field) {
|
||||
final StringBuffer sb=new StringBuffer();
|
||||
if (!this.fields[0].equals(field)) sb.append(this.fields[0]).append(':');
|
||||
if (!this.field.equals(field)) sb.append(this.field).append(':');
|
||||
return sb.append(minInclusive ? '[' : '{')
|
||||
.append((min==null) ? "*" : min.toString())
|
||||
.append(" TO ")
|
||||
|
@ -66,7 +65,7 @@ abstract class AbstractTrieRangeFilter extends Filter {
|
|||
if (this.getClass().equals(o.getClass())) {
|
||||
AbstractTrieRangeFilter q=(AbstractTrieRangeFilter)o;
|
||||
return (
|
||||
Arrays.equals(fields,q.fields) &&
|
||||
field==q.field &&
|
||||
(q.min == null ? min == null : q.min.equals(min)) &&
|
||||
(q.max == null ? max == null : q.max.equals(max)) &&
|
||||
minInclusive==q.minInclusive &&
|
||||
|
@ -79,7 +78,7 @@ abstract class AbstractTrieRangeFilter extends Filter {
|
|||
|
||||
//@Override
|
||||
public final int hashCode() {
|
||||
int hash=Arrays.asList(fields).hashCode()+(precisionStep^0x64365465);
|
||||
int hash = field.hashCode() + (precisionStep^0x64365465);
|
||||
if (min!=null) hash += min.hashCode()^0x14fa55fb;
|
||||
if (max!=null) hash += max.hashCode()^0x733fa5fe;
|
||||
return hash+
|
||||
|
@ -123,12 +122,10 @@ abstract class AbstractTrieRangeFilter extends Filter {
|
|||
void fillBits(
|
||||
final IndexReader reader,
|
||||
final OpenBitSet bits, final TermDocs termDocs,
|
||||
String field,
|
||||
final String lowerTerm, final String upperTerm
|
||||
) throws IOException {
|
||||
final int len=lowerTerm.length();
|
||||
assert upperTerm.length()==len;
|
||||
field=field.intern();
|
||||
|
||||
// find the docs
|
||||
final TermEnum enumerator = reader.terms(new Term(field, lowerTerm));
|
||||
|
@ -151,7 +148,7 @@ abstract class AbstractTrieRangeFilter extends Filter {
|
|||
}
|
||||
|
||||
// members
|
||||
final String[] fields;
|
||||
final String field;
|
||||
final int precisionStep;
|
||||
final Number min,max;
|
||||
final boolean minInclusive,maxInclusive;
|
||||
|
|
|
@ -30,7 +30,7 @@ import org.apache.lucene.util.OpenBitSet;
|
|||
/**
|
||||
* Implementation of a Lucene {@link Filter} that implements trie-based range filtering for ints/floats.
|
||||
* This filter depends on a specific structure of terms in the index that can only be created
|
||||
* by {@link TrieUtils} methods.
|
||||
* by indexing via {@link IntTrieTokenStream} methods.
|
||||
* For more information, how the algorithm works, see the {@linkplain org.apache.lucene.search.trie package description}.
|
||||
*/
|
||||
public class IntTrieRangeFilter extends AbstractTrieRangeFilter {
|
||||
|
@ -43,49 +43,11 @@ public class IntTrieRangeFilter extends AbstractTrieRangeFilter {
|
|||
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
|
||||
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
|
||||
* To query float values use the converter {@link TrieUtils#floatToSortableInt}.
|
||||
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String[])}.
|
||||
* <p><b>This is the recommended usage of TrieUtils/IntTrieRangeFilter.</b>
|
||||
*/
|
||||
public IntTrieRangeFilter(final String field, final int precisionStep,
|
||||
final Integer min, final Integer max, final boolean minInclusive, final boolean maxInclusive
|
||||
) {
|
||||
this(
|
||||
new String[]{field, field+TrieUtils.LOWER_PRECISION_FIELD_NAME_SUFFIX},
|
||||
precisionStep,min,max,minInclusive,maxInclusive
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: A trie filter for matching trie coded values using the given field names.
|
||||
* You can specify the main and helper field name, that was used to idex the values.
|
||||
* <code>precisionStep</code> must me equal or a multiple of the <code>precisionStep</code>
|
||||
* used for indexing the values.
|
||||
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
|
||||
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
|
||||
* To query float values use the converter {@link TrieUtils#floatToSortableInt}.
|
||||
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String,String[])}.
|
||||
*/
|
||||
public IntTrieRangeFilter(final String field, final String lowerPrecisionField, final int precisionStep,
|
||||
final Integer min, final Integer max, final boolean minInclusive, final boolean maxInclusive
|
||||
) {
|
||||
this(new String[]{field, lowerPrecisionField},precisionStep,min,max,minInclusive,maxInclusive);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: A trie filter for matching trie coded values
|
||||
* using the given field names. If the array of field names is shorter than the
|
||||
* trieCoded one, all trieCoded values with higher index get the last field name.
|
||||
* <code>precisionStep</code> must me equal or a multiple of the <code>precisionStep</code>
|
||||
* used for indexing the values.
|
||||
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
|
||||
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
|
||||
* To query float values use the converter {@link TrieUtils#floatToSortableInt}.
|
||||
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String[],String[])}.
|
||||
*/
|
||||
public IntTrieRangeFilter(final String[] fields, final int precisionStep,
|
||||
Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive
|
||||
) {
|
||||
super(fields, precisionStep, min, max, minInclusive, maxInclusive);
|
||||
super(field,precisionStep,min,max,minInclusive,maxInclusive);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -112,11 +74,10 @@ public class IntTrieRangeFilter extends AbstractTrieRangeFilter {
|
|||
TrieUtils.splitIntRange(new TrieUtils.IntRangeBuilder() {
|
||||
|
||||
//@Override
|
||||
public final void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
|
||||
public final void addRange(String minPrefixCoded, String maxPrefixCoded) {
|
||||
try {
|
||||
fillBits(
|
||||
reader, bits, termDocs,
|
||||
fields[Math.min(fields.length-1, level)],
|
||||
minPrefixCoded, maxPrefixCoded
|
||||
);
|
||||
} catch (IOException ioe) {
|
||||
|
|
|
@ -0,0 +1,172 @@
|
|||
package org.apache.lucene.search.trie;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
/**
|
||||
* This class provides a {@link TokenStream} for indexing <code>int</code> values
|
||||
* that can be queried by {@link IntTrieRangeFilter}. This stream is not intended
|
||||
* to be used in analyzers, it's more for iterating the different precisions during
|
||||
* indexing a specific numeric value.
|
||||
* <p>A <code>int</code> value is indexed as multiple string encoded terms, each reduced
|
||||
* by zeroing bits from the right. Each value is also prefixed (in the first char) by the
|
||||
* <code>shift</code> value (number of bits removed) used during encoding.
|
||||
* <p>The number of bits removed from the right for each trie entry is called
|
||||
* <code>precisionStep</code> in this API. For comparing the different step values, see the
|
||||
* {@linkplain org.apache.lucene.search.trie package description}.
|
||||
* <p>The usage pattern is (it is recommended to switch off norms and term frequencies
|
||||
* for numeric fields; it does not make sense to have them):
|
||||
* <pre>
|
||||
* Field field = new Field(name, new IntTrieTokenStream(value, precisionStep));
|
||||
* field.setOmitNorms(true);
|
||||
* field.setOmitTermFreqAndPositions(true);
|
||||
* document.add(field);
|
||||
* </pre>
|
||||
* <p>For optimal performance, re-use the TokenStream and Field instance
|
||||
* for more than one document:
|
||||
* <pre>
|
||||
* <em>// init</em>
|
||||
* TokenStream stream = new IntTrieTokenStream(precisionStep);
|
||||
* Field field = new Field(name, stream);
|
||||
* field.setOmitNorms(true);
|
||||
* field.setOmitTermFreqAndPositions(true);
|
||||
* <em>// use this code to index many documents:</em>
|
||||
* stream.setValue(value1);
|
||||
* document.add(field);
|
||||
* writer.addDocument(document);
|
||||
* stream.setValue(value2);
|
||||
* document.add(field);
|
||||
* writer.addDocument(document);
|
||||
* ...
|
||||
* </pre>
|
||||
* <p><em>Please note:</em> Token streams are read, when the document is added to index.
|
||||
* If you index more than one numeric field, use a separate instance for each.
|
||||
* <p>For more information, how trie fields work, see the
|
||||
* {@linkplain org.apache.lucene.search.trie package description}.
|
||||
*/
|
||||
public class IntTrieTokenStream extends TokenStream {
|
||||
|
||||
/** The full precision token gets this token type assigned. */
|
||||
public static final String TOKEN_TYPE_FULL_PREC = "fullPrecTrieInt";
|
||||
|
||||
/** The lower precision tokens get this token type assigned. */
|
||||
public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecTrieInt";
|
||||
|
||||
/**
|
||||
* Creates a token stream for indexing <code>value</code> with the given
|
||||
* <code>precisionStep</code>. As instance creating is a major cost,
|
||||
* consider using a {@link #IntTrieTokenStream(int)} instance once for
|
||||
* indexing a large number of documents and assign a value with
|
||||
* {@link #setValue} for each document.
|
||||
* To index float values use the converter {@link TrieUtils#floatToSortableInt}.
|
||||
*/
|
||||
public IntTrieTokenStream(final int value, final int precisionStep) {
|
||||
if (precisionStep<1 || precisionStep>32)
|
||||
throw new IllegalArgumentException("precisionStep may only be 1..32");
|
||||
this.value = value;
|
||||
this.precisionStep = precisionStep;
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
shiftAtt = (ShiftAttribute) addAttribute(ShiftAttribute.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a token stream for indexing values with the given
|
||||
* <code>precisionStep</code>. This stream is initially "empty"
|
||||
* (using a numeric value of 0), assign a value before indexing
|
||||
* each document using {@link #setValue}.
|
||||
*/
|
||||
public IntTrieTokenStream(final int precisionStep) {
|
||||
this(0, precisionStep);
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the token stream to deliver prefix encoded values
|
||||
* for <code>value</code>. Use this method to index the same
|
||||
* numeric field for a large number of documents and reuse the
|
||||
* current stream instance.
|
||||
* To index float values use the converter {@link TrieUtils#floatToSortableInt}.
|
||||
*/
|
||||
public void setValue(final int value) {
|
||||
this.value = value;
|
||||
reset();
|
||||
}
|
||||
|
||||
// @Override
|
||||
public void reset() {
|
||||
shift = 0;
|
||||
}
|
||||
|
||||
// @Override
|
||||
public boolean incrementToken() {
|
||||
if (shift>=32) return false;
|
||||
final char[] buffer = termAtt.resizeTermBuffer(TrieUtils.INT_BUF_SIZE);
|
||||
termAtt.setTermLength(TrieUtils.intToPrefixCoded(value, shift, buffer));
|
||||
shiftAtt.setShift(shift);
|
||||
if (shift==0) {
|
||||
typeAtt.setType(TOKEN_TYPE_FULL_PREC);
|
||||
posIncrAtt.setPositionIncrement(1);
|
||||
} else {
|
||||
typeAtt.setType(TOKEN_TYPE_LOWER_PREC);
|
||||
posIncrAtt.setPositionIncrement(0);
|
||||
}
|
||||
shift += precisionStep;
|
||||
return true;
|
||||
}
|
||||
|
||||
// @Override
|
||||
/** @deprecated */
|
||||
public Token next(final Token reusableToken) {
|
||||
if (shift>=32) return null;
|
||||
reusableToken.clear();
|
||||
final char[] buffer = reusableToken.resizeTermBuffer(TrieUtils.INT_BUF_SIZE);
|
||||
reusableToken.setTermLength(TrieUtils.intToPrefixCoded(value, shift, buffer));
|
||||
if (shift==0) {
|
||||
reusableToken.setType(TOKEN_TYPE_FULL_PREC);
|
||||
reusableToken.setPositionIncrement(1);
|
||||
} else {
|
||||
reusableToken.setType(TOKEN_TYPE_LOWER_PREC);
|
||||
reusableToken.setPositionIncrement(0);
|
||||
}
|
||||
shift += precisionStep;
|
||||
return reusableToken;
|
||||
}
|
||||
|
||||
// @Override
|
||||
public String toString() {
|
||||
final StringBuffer sb = new StringBuffer("(trie-int,value=").append(value);
|
||||
sb.append(",precisionStep=").append(precisionStep).append(')');
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
// members
|
||||
private final TermAttribute termAtt;
|
||||
private final TypeAttribute typeAtt;
|
||||
private final PositionIncrementAttribute posIncrAtt;
|
||||
private final ShiftAttribute shiftAtt;
|
||||
|
||||
private int shift = 0;
|
||||
private int value;
|
||||
private final int precisionStep;
|
||||
}
|
|
@ -30,7 +30,7 @@ import org.apache.lucene.util.OpenBitSet;
|
|||
/**
|
||||
* Implementation of a Lucene {@link Filter} that implements trie-based range filtering for longs/doubles.
|
||||
* This filter depends on a specific structure of terms in the index that can only be created
|
||||
* by {@link TrieUtils} methods.
|
||||
* by indexing via {@link LongTrieTokenStream} methods.
|
||||
* For more information, how the algorithm works, see the {@linkplain org.apache.lucene.search.trie package description}.
|
||||
*/
|
||||
public class LongTrieRangeFilter extends AbstractTrieRangeFilter {
|
||||
|
@ -43,49 +43,11 @@ public class LongTrieRangeFilter extends AbstractTrieRangeFilter {
|
|||
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
|
||||
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
|
||||
* To query double values use the converter {@link TrieUtils#doubleToSortableLong}.
|
||||
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String[])}.
|
||||
* <p><b>This is the recommended usage of TrieUtils/LongTrieRangeFilter.</b>
|
||||
*/
|
||||
public LongTrieRangeFilter(final String field, final int precisionStep,
|
||||
final Long min, final Long max, final boolean minInclusive, final boolean maxInclusive
|
||||
) {
|
||||
this(
|
||||
new String[]{field, field+TrieUtils.LOWER_PRECISION_FIELD_NAME_SUFFIX},
|
||||
precisionStep,min,max,minInclusive,maxInclusive
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: A trie filter for matching trie coded values using the given field names.
|
||||
* You can specify the main and helper field name, that was used to idex the values.
|
||||
* <code>precisionStep</code> must me equal or a multiple of the <code>precisionStep</code>
|
||||
* used for indexing the values.
|
||||
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
|
||||
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
|
||||
* To query double values use the converter {@link TrieUtils#doubleToSortableLong}.
|
||||
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String,String[])}.
|
||||
*/
|
||||
public LongTrieRangeFilter(final String field, final String lowerPrecisionField, final int precisionStep,
|
||||
final Long min, final Long max, final boolean minInclusive, final boolean maxInclusive
|
||||
) {
|
||||
this(new String[]{field, lowerPrecisionField},precisionStep,min,max,minInclusive,maxInclusive);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: A trie filter for matching trie coded values
|
||||
* using the given field names. If the array of field names is shorter than the
|
||||
* trieCoded one, all trieCoded values with higher index get the last field name.
|
||||
* <code>precisionStep</code> must me equal or a multiple of the <code>precisionStep</code>
|
||||
* used for indexing the values.
|
||||
* You can leave the bounds open, by supplying <code>null</code> for <code>min</code> and/or
|
||||
* <code>max</code>. Inclusive/exclusive bounds can also be supplied.
|
||||
* To query double values use the converter {@link TrieUtils#doubleToSortableLong}.
|
||||
* <p>This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String[],String[])}.
|
||||
*/
|
||||
public LongTrieRangeFilter(final String[] fields, final int precisionStep,
|
||||
Long min, Long max, final boolean minInclusive, final boolean maxInclusive
|
||||
) {
|
||||
super(fields, precisionStep, min, max, minInclusive, maxInclusive);
|
||||
super(field,precisionStep,min,max,minInclusive,maxInclusive);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -112,11 +74,10 @@ public class LongTrieRangeFilter extends AbstractTrieRangeFilter {
|
|||
TrieUtils.splitLongRange(new TrieUtils.LongRangeBuilder() {
|
||||
|
||||
//@Override
|
||||
public final void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
|
||||
public final void addRange(String minPrefixCoded, String maxPrefixCoded) {
|
||||
try {
|
||||
fillBits(
|
||||
reader, bits, termDocs,
|
||||
fields[Math.min(fields.length-1, level)],
|
||||
minPrefixCoded, maxPrefixCoded
|
||||
);
|
||||
} catch (IOException ioe) {
|
||||
|
|
|
@ -0,0 +1,172 @@
|
|||
package org.apache.lucene.search.trie;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
/**
|
||||
* This class provides a {@link TokenStream} for indexing <code>long</code> values
|
||||
* that can be queried by {@link LongTrieRangeFilter}. This stream is not intended
|
||||
* to be used in analyzers, it's more for iterating the different precisions during
|
||||
* indexing a specific numeric value.
|
||||
* <p>A <code>long</code> value is indexed as multiple string encoded terms, each reduced
|
||||
* by zeroing bits from the right. Each value is also prefixed (in the first char) by the
|
||||
* <code>shift</code> value (number of bits removed) used during encoding.
|
||||
* <p>The number of bits removed from the right for each trie entry is called
|
||||
* <code>precisionStep</code> in this API. For comparing the different step values, see the
|
||||
* {@linkplain org.apache.lucene.search.trie package description}.
|
||||
* <p>The usage pattern is (it is recommended to switch off norms and term frequencies
|
||||
* for numeric fields; it does not make sense to have them):
|
||||
* <pre>
|
||||
* Field field = new Field(name, new LongTrieTokenStream(value, precisionStep));
|
||||
* field.setOmitNorms(true);
|
||||
* field.setOmitTermFreqAndPositions(true);
|
||||
* document.add(field);
|
||||
* </pre>
|
||||
* <p>For optimal performance, re-use the TokenStream and Field instance
|
||||
* for more than one document:
|
||||
* <pre>
|
||||
* <em>// init</em>
|
||||
* TokenStream stream = new LongTrieTokenStream(precisionStep);
|
||||
* Field field = new Field(name, stream);
|
||||
* field.setOmitNorms(true);
|
||||
* field.setOmitTermFreqAndPositions(true);
|
||||
* <em>// use this code to index many documents:</em>
|
||||
* stream.setValue(value1);
|
||||
* document.add(field);
|
||||
* writer.addDocument(document);
|
||||
* stream.setValue(value2);
|
||||
* document.add(field);
|
||||
* writer.addDocument(document);
|
||||
* ...
|
||||
* </pre>
|
||||
* <p><em>Please note:</em> Token streams are read, when the document is added to index.
|
||||
* If you index more than one numeric field, use a separate instance for each.
|
||||
* <p>For more information, how trie fields work, see the
|
||||
* {@linkplain org.apache.lucene.search.trie package description}.
|
||||
*/
|
||||
public class LongTrieTokenStream extends TokenStream {
|
||||
|
||||
/** The full precision token gets this token type assigned. */
|
||||
public static final String TOKEN_TYPE_FULL_PREC = "fullPrecTrieLong";
|
||||
|
||||
/** The lower precision tokens get this token type assigned. */
|
||||
public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecTrieLong";
|
||||
|
||||
/**
|
||||
* Creates a token stream for indexing <code>value</code> with the given
|
||||
* <code>precisionStep</code>. As instance creating is a major cost,
|
||||
* consider using a {@link #LongTrieTokenStream(int)} instance once for
|
||||
* indexing a large number of documents and assign a value with
|
||||
* {@link #setValue} for each document.
|
||||
* To index double values use the converter {@link TrieUtils#doubleToSortableLong}.
|
||||
*/
|
||||
public LongTrieTokenStream(final long value, final int precisionStep) {
|
||||
if (precisionStep<1 || precisionStep>64)
|
||||
throw new IllegalArgumentException("precisionStep may only be 1..64");
|
||||
this.value = value;
|
||||
this.precisionStep = precisionStep;
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
shiftAtt = (ShiftAttribute) addAttribute(ShiftAttribute.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a token stream for indexing values with the given
|
||||
* <code>precisionStep</code>. This stream is initially "empty"
|
||||
* (using a numeric value of 0), assign a value before indexing
|
||||
* each document using {@link #setValue}.
|
||||
*/
|
||||
public LongTrieTokenStream(final int precisionStep) {
|
||||
this(0L, precisionStep);
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the token stream to deliver prefix encoded values
|
||||
* for <code>value</code>. Use this method to index the same
|
||||
* numeric field for a large number of documents and reuse the
|
||||
* current stream instance.
|
||||
* To index double values use the converter {@link TrieUtils#doubleToSortableLong}.
|
||||
*/
|
||||
public void setValue(final long value) {
|
||||
this.value = value;
|
||||
reset();
|
||||
}
|
||||
|
||||
// @Override
|
||||
public void reset() {
|
||||
shift = 0;
|
||||
}
|
||||
|
||||
// @Override
|
||||
public boolean incrementToken() {
|
||||
if (shift>=64) return false;
|
||||
final char[] buffer = termAtt.resizeTermBuffer(TrieUtils.LONG_BUF_SIZE);
|
||||
termAtt.setTermLength(TrieUtils.longToPrefixCoded(value, shift, buffer));
|
||||
shiftAtt.setShift(shift);
|
||||
if (shift==0) {
|
||||
typeAtt.setType(TOKEN_TYPE_FULL_PREC);
|
||||
posIncrAtt.setPositionIncrement(1);
|
||||
} else {
|
||||
typeAtt.setType(TOKEN_TYPE_LOWER_PREC);
|
||||
posIncrAtt.setPositionIncrement(0);
|
||||
}
|
||||
shift += precisionStep;
|
||||
return true;
|
||||
}
|
||||
|
||||
// @Override
|
||||
/** @deprecated */
|
||||
public Token next(final Token reusableToken) {
|
||||
if (shift>=64) return null;
|
||||
reusableToken.clear();
|
||||
final char[] buffer = reusableToken.resizeTermBuffer(TrieUtils.LONG_BUF_SIZE);
|
||||
reusableToken.setTermLength(TrieUtils.longToPrefixCoded(value, shift, buffer));
|
||||
if (shift==0) {
|
||||
reusableToken.setType(TOKEN_TYPE_FULL_PREC);
|
||||
reusableToken.setPositionIncrement(1);
|
||||
} else {
|
||||
reusableToken.setType(TOKEN_TYPE_LOWER_PREC);
|
||||
reusableToken.setPositionIncrement(0);
|
||||
}
|
||||
shift += precisionStep;
|
||||
return reusableToken;
|
||||
}
|
||||
|
||||
// @Override
|
||||
public String toString() {
|
||||
final StringBuffer sb = new StringBuffer("(trie-long,value=").append(value);
|
||||
sb.append(",precisionStep=").append(precisionStep).append(')');
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
// members
|
||||
private final TermAttribute termAtt;
|
||||
private final TypeAttribute typeAtt;
|
||||
private final PositionIncrementAttribute posIncrAtt;
|
||||
private final ShiftAttribute shiftAtt;
|
||||
|
||||
private int shift = 0;
|
||||
private long value;
|
||||
private final int precisionStep;
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
package org.apache.lucene.search.trie;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* This attribute is updated by {@link IntTrieTokenStream} and {@link LongTrieTokenStream}
|
||||
* to the shift value of the current prefix-encoded token.
|
||||
* It may be used by filters or consumers to e.g. distribute the values to various fields.
|
||||
*/
|
||||
public final class ShiftAttribute extends Attribute implements Cloneable, Serializable {
|
||||
private int shift = 0;
|
||||
|
||||
/**
|
||||
* Returns the shift value of the current prefix encoded token.
|
||||
*/
|
||||
public int getShift() {
|
||||
return shift;
|
||||
}
|
||||
|
||||
void setShift(final int shift) {
|
||||
this.shift = shift;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
shift = 0;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "shift=" + shift;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (this == other) return true;
|
||||
if (other instanceof ShiftAttribute) {
|
||||
return ((ShiftAttribute) other).shift == shift;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return shift;
|
||||
}
|
||||
|
||||
public void copyTo(Attribute target) {
|
||||
final ShiftAttribute t = (ShiftAttribute) target;
|
||||
t.setShift(shift);
|
||||
}
|
||||
}
|
|
@ -17,17 +17,13 @@ package org.apache.lucene.search.trie;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.ExtendedFieldCache;
|
||||
|
||||
/**
|
||||
* This is a helper class to construct the trie-based index entries for numerical values.
|
||||
* For more information on how the algorithm works, see the
|
||||
* {@linkplain org.apache.lucene.search.trie package description}.
|
||||
* <h3>The trie format using prefix encoded numerical values</h3>
|
||||
* This is a helper class to generate prefix-encoded representations for numerical values
|
||||
* and supplies converters to represent float/double values as sortable integers/longs.
|
||||
* <p>To quickly execute range queries in Apache Lucene, a range is divided recursively
|
||||
* into multiple intervals for searching: The center of the range is searched only with
|
||||
* the lowest possible precision in the trie, while the boundaries are matched
|
||||
|
@ -35,54 +31,48 @@ import org.apache.lucene.search.ExtendedFieldCache;
|
|||
* <p>This class generates terms to achieve this: First the numerical integer values need to
|
||||
* be converted to strings. For that integer values (32 bit or 64 bit) are made unsigned
|
||||
* and the bits are converted to ASCII chars with each 7 bit. The resulting string is
|
||||
* sortable like the original integer value.
|
||||
* sortable like the original integer value. Each value is also prefixed
|
||||
* (in the first char) by the <code>shift</code> value (number of bits removed) used
|
||||
* during encoding.
|
||||
* <p>To also index floating point numbers, this class supplies two methods to convert them
|
||||
* to integer values by changing their bit layout: {@link #doubleToSortableLong},
|
||||
* {@link #floatToSortableInt}. You will have no precision loss by
|
||||
* converting floating point numbers to integers and back (only that the integer form
|
||||
* is not usable). Other data types like dates can easily be converted to longs or ints (e.g.
|
||||
* date to long: {@link java.util.Date#getTime}).
|
||||
* <p>To index the different precisions of the long values each encoded value is also reduced
|
||||
* by zeroing bits from the right. Each value is also prefixed (in the first char) by the
|
||||
* <code>shift</code> value (number of bits removed) used during encoding. This series of
|
||||
* different precision values can be indexed into a Lucene {@link Document} using
|
||||
* {@link #addIndexedFields(Document,String,String[])}. The default is to index the original
|
||||
* precision in the supplied field name and the lower precisions in an additional helper field.
|
||||
* Because of this, the full-precision field can also be sorted (using {@link #getLongSortField}
|
||||
* or {@link #getIntSortField}).
|
||||
* <p>The number of bits removed from the right for each trie entry is called
|
||||
* <code>precisionStep</code> in this API. For comparing the different step values, see the
|
||||
* {@linkplain org.apache.lucene.search.trie package description}.
|
||||
* <p>Prefix encoded fields can also be sorted using the {@link SortField} factories
|
||||
* {@link #getLongSortField} or {@link #getIntSortField}.
|
||||
*/
|
||||
public final class TrieUtils {
|
||||
|
||||
private TrieUtils() {} // no instance!
|
||||
|
||||
/**
|
||||
* The default "helper" field containing the lower precision terms is the original
|
||||
* fieldname with this appended. This suffix is used in
|
||||
* {@link #addIndexedFields(Document,String,String[])} and the corresponding c'tor
|
||||
* of <code>(Long|Int)TrieRangeFilter</code>.
|
||||
*/
|
||||
public static final String LOWER_PRECISION_FIELD_NAME_SUFFIX="#trie";
|
||||
|
||||
/**
|
||||
* Longs are stored at lower precision by shifting off lower bits. The shift count is
|
||||
* stored as <code>SHIFT_START_LONG+shift</code> in the first character
|
||||
*/
|
||||
public static final char SHIFT_START_LONG = (char)0x20;
|
||||
|
||||
/** internal: maximum needed <code>char[]</code> buffer size for encoding */
|
||||
static final int LONG_BUF_SIZE = 63/7 + 2;
|
||||
|
||||
/**
|
||||
* Integers are stored at lower precision by shifting off lower bits. The shift count is
|
||||
* stored as <code>SHIFT_START_INT+shift</code> in the first character
|
||||
*/
|
||||
public static final char SHIFT_START_INT = (char)0x60;
|
||||
|
||||
/** internal: maximum needed <code>char[]</code> buffer size for encoding */
|
||||
static final int INT_BUF_SIZE = 31/7 + 2;
|
||||
|
||||
/**
|
||||
* A parser instance for filling a {@link ExtendedFieldCache}, that parses prefix encoded fields as longs.
|
||||
*/
|
||||
public static final ExtendedFieldCache.LongParser FIELD_CACHE_LONG_PARSER=new ExtendedFieldCache.LongParser(){
|
||||
public final long parseLong(final String val) {
|
||||
final int shift = val.charAt(0)-SHIFT_START_LONG;
|
||||
if (shift>0 && shift<=63)
|
||||
throw new FieldCache.StopFillCacheException();
|
||||
return prefixCodedToLong(val);
|
||||
}
|
||||
};
|
||||
|
@ -92,6 +82,9 @@ public final class TrieUtils {
|
|||
*/
|
||||
public static final FieldCache.IntParser FIELD_CACHE_INT_PARSER=new FieldCache.IntParser(){
|
||||
public final int parseInt(final String val) {
|
||||
final int shift = val.charAt(0)-SHIFT_START_INT;
|
||||
if (shift>0 && shift<=31)
|
||||
throw new FieldCache.StopFillCacheException();
|
||||
return prefixCodedToInt(val);
|
||||
}
|
||||
};
|
||||
|
@ -102,6 +95,9 @@ public final class TrieUtils {
|
|||
*/
|
||||
public static final ExtendedFieldCache.DoubleParser FIELD_CACHE_DOUBLE_PARSER=new ExtendedFieldCache.DoubleParser(){
|
||||
public final double parseDouble(final String val) {
|
||||
final int shift = val.charAt(0)-SHIFT_START_LONG;
|
||||
if (shift>0 && shift<=63)
|
||||
throw new FieldCache.StopFillCacheException();
|
||||
return sortableLongToDouble(prefixCodedToLong(val));
|
||||
}
|
||||
};
|
||||
|
@ -112,9 +108,28 @@ public final class TrieUtils {
|
|||
*/
|
||||
public static final FieldCache.FloatParser FIELD_CACHE_FLOAT_PARSER=new FieldCache.FloatParser(){
|
||||
public final float parseFloat(final String val) {
|
||||
final int shift = val.charAt(0)-SHIFT_START_INT;
|
||||
if (shift>0 && shift<=31)
|
||||
throw new FieldCache.StopFillCacheException();
|
||||
return sortableIntToFloat(prefixCodedToInt(val));
|
||||
}
|
||||
};
|
||||
|
||||
/** internal */
|
||||
static int longToPrefixCoded(final long val, final int shift, final char[] buffer) {
|
||||
int nChars = (63-shift)/7 + 1, len = nChars+1;
|
||||
buffer[0] = (char)(SHIFT_START_LONG + shift);
|
||||
long sortableBits = val ^ 0x8000000000000000L;
|
||||
sortableBits >>>= shift;
|
||||
while (nChars>=1) {
|
||||
// Store 7 bits per character for good efficiency when UTF-8 encoding.
|
||||
// The whole number is right-justified so that lucene can prefix-encode
|
||||
// the terms more efficiently.
|
||||
buffer[nChars--] = (char)(sortableBits & 0x7f);
|
||||
sortableBits >>>= 7;
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* This is a convenience method, that returns prefix coded bits of a long without
|
||||
|
@ -125,27 +140,33 @@ public final class TrieUtils {
|
|||
public static String longToPrefixCoded(final long val) {
|
||||
return longToPrefixCoded(val, 0);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
|
||||
* This is method is used by {@link #trieCodeLong}.
|
||||
* This method is used by {@link LongRangeBuilder}.
|
||||
*/
|
||||
public static String longToPrefixCoded(final long val, final int shift) {
|
||||
if (shift>63 || shift<0)
|
||||
throw new IllegalArgumentException("Illegal shift value, must be 0..63");
|
||||
int nChars = (63-shift)/7 + 1;
|
||||
final char[] arr = new char[nChars+1];
|
||||
arr[0] = (char)(SHIFT_START_LONG + shift);
|
||||
long sortableBits = val ^ 0x8000000000000000L;
|
||||
final char[] buffer = new char[LONG_BUF_SIZE];
|
||||
final int len = longToPrefixCoded(val, shift, buffer);
|
||||
return new String(buffer, 0, len);
|
||||
}
|
||||
|
||||
/** internal */
|
||||
static int intToPrefixCoded(final int val, final int shift, final char[] buffer) {
|
||||
int nChars = (31-shift)/7 + 1, len = nChars+1;
|
||||
buffer[0] = (char)(SHIFT_START_INT + shift);
|
||||
int sortableBits = val ^ 0x80000000;
|
||||
sortableBits >>>= shift;
|
||||
while (nChars>=1) {
|
||||
// Store 7 bits per character for good efficiency when UTF-8 encoding.
|
||||
// The whole number is right-justified so that lucene can prefix-encode
|
||||
// the terms more efficiently.
|
||||
arr[nChars--] = (char)(sortableBits & 0x7f);
|
||||
buffer[nChars--] = (char)(sortableBits & 0x7f);
|
||||
sortableBits >>>= 7;
|
||||
}
|
||||
return new String(arr);
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -160,39 +181,30 @@ public final class TrieUtils {
|
|||
|
||||
/**
|
||||
* Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
|
||||
* This is method is used by {@link #trieCodeInt}.
|
||||
* This method is used by {@link IntRangeBuilder}.
|
||||
*/
|
||||
public static String intToPrefixCoded(final int val, final int shift) {
|
||||
if (shift>31 || shift<0)
|
||||
throw new IllegalArgumentException("Illegal shift value, must be 0..31");
|
||||
int nChars = (31-shift)/7 + 1;
|
||||
final char[] arr = new char[nChars+1];
|
||||
arr[0] = (char)(SHIFT_START_INT + shift);
|
||||
int sortableBits = val ^ 0x80000000;
|
||||
sortableBits >>>= shift;
|
||||
while (nChars>=1) {
|
||||
// Store 7 bits per character for good efficiency when UTF-8 encoding.
|
||||
// The whole number is right-justified so that lucene can prefix-encode
|
||||
// the terms more efficiently.
|
||||
arr[nChars--] = (char)(sortableBits & 0x7f);
|
||||
sortableBits >>>= 7;
|
||||
}
|
||||
return new String(arr);
|
||||
final char[] buffer = new char[INT_BUF_SIZE];
|
||||
final int len = intToPrefixCoded(val, shift, buffer);
|
||||
return new String(buffer, 0, len);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a long from prefixCoded characters.
|
||||
* Rightmost bits will be zero for lower precision codes.
|
||||
* This method can be used to decode e.g. a stored field.
|
||||
* @throws NumberFormatException if the supplied string is
|
||||
* not correctly prefix encoded.
|
||||
* @see #longToPrefixCoded(long)
|
||||
*/
|
||||
public static long prefixCodedToLong(final String prefixCoded) {
|
||||
final int len = prefixCoded.length();
|
||||
final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG;
|
||||
if (shift>63 || shift<0)
|
||||
throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)");
|
||||
long sortableBits = 0L;
|
||||
for (int i=1; i<len; i++) {
|
||||
for (int i=1, len=prefixCoded.length(); i<len; i++) {
|
||||
sortableBits <<= 7;
|
||||
final char ch = prefixCoded.charAt(i);
|
||||
if (ch>0x7f) {
|
||||
|
@ -201,7 +213,7 @@ public final class TrieUtils {
|
|||
Integer.toHexString((int)ch)+" at position "+i+" is invalid)"
|
||||
);
|
||||
}
|
||||
sortableBits |= (long)(ch & 0x7f);
|
||||
sortableBits |= (long)ch;
|
||||
}
|
||||
return (sortableBits << shift) ^ 0x8000000000000000L;
|
||||
}
|
||||
|
@ -210,15 +222,16 @@ public final class TrieUtils {
|
|||
* Returns an int from prefixCoded characters.
|
||||
* Rightmost bits will be zero for lower precision codes.
|
||||
* This method can be used to decode e.g. a stored field.
|
||||
* @throws NumberFormatException if the supplied string is
|
||||
* not correctly prefix encoded.
|
||||
* @see #intToPrefixCoded(int)
|
||||
*/
|
||||
public static int prefixCodedToInt(final String prefixCoded) {
|
||||
final int len = prefixCoded.length();
|
||||
final int shift = prefixCoded.charAt(0)-SHIFT_START_INT;
|
||||
if (shift>31 || shift<0)
|
||||
throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really an INT?)");
|
||||
int sortableBits = 0;
|
||||
for (int i=1; i<len; i++) {
|
||||
for (int i=1, len=prefixCoded.length(); i<len; i++) {
|
||||
sortableBits <<= 7;
|
||||
final char ch = prefixCoded.charAt(i);
|
||||
if (ch>0x7f) {
|
||||
|
@ -227,7 +240,7 @@ public final class TrieUtils {
|
|||
Integer.toHexString((int)ch)+" at position "+i+" is invalid)"
|
||||
);
|
||||
}
|
||||
sortableBits |= (int)(ch & 0x7f);
|
||||
sortableBits |= (int)ch;
|
||||
}
|
||||
return (sortableBits << shift) ^ 0x80000000;
|
||||
}
|
||||
|
@ -277,116 +290,20 @@ public final class TrieUtils {
|
|||
}
|
||||
|
||||
/** A factory method, that generates a {@link SortField} instance for sorting prefix encoded long values. */
|
||||
public static SortField getLongSortField(final String field, boolean reverse) {
|
||||
public static SortField getLongSortField(final String field, final boolean reverse) {
|
||||
return new SortField(field, FIELD_CACHE_LONG_PARSER, reverse);
|
||||
}
|
||||
|
||||
/** A factory method, that generates a {@link SortField} instance for sorting prefix encoded int values. */
|
||||
public static SortField getIntSortField(final String field, boolean reverse) {
|
||||
public static SortField getIntSortField(final String field, final boolean reverse) {
|
||||
return new SortField(field, FIELD_CACHE_INT_PARSER, reverse);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a sequence of trie coded numbers suitable for {@link LongTrieRangeFilter}.
|
||||
* Each successive string in the list has had it's precision reduced by <code>precisionStep</code>.
|
||||
* For sorting, index the first full-precision value into a separate field and the
|
||||
* remaining values into another field.
|
||||
* <p>To achieve this, use {@link #addIndexedFields(Document,String,String[])}.
|
||||
*/
|
||||
public static String[] trieCodeLong(long val, int precisionStep) {
|
||||
if (precisionStep<1 || precisionStep>64)
|
||||
throw new IllegalArgumentException("precisionStep may only be 1..64");
|
||||
String[] arr = new String[63/precisionStep+1];
|
||||
int idx = 0;
|
||||
for (int shift=0; shift<64; shift+=precisionStep) {
|
||||
arr[idx++] = longToPrefixCoded(val, shift);
|
||||
}
|
||||
return arr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a sequence of trie coded numbers suitable for {@link IntTrieRangeFilter}.
|
||||
* Each successive string in the list has had it's precision reduced by <code>precisionStep</code>.
|
||||
* For sorting, index the first full-precision value into a separate field and the
|
||||
* remaining values into another field.
|
||||
* <p>To achieve this, use {@link #addIndexedFields(Document,String,String[])}.
|
||||
*/
|
||||
public static String[] trieCodeInt(int val, int precisionStep) {
|
||||
if (precisionStep<1 || precisionStep>32)
|
||||
throw new IllegalArgumentException("precisionStep may only be 1..32");
|
||||
String[] arr = new String[31/precisionStep+1];
|
||||
int idx = 0;
|
||||
for (int shift=0; shift<32; shift+=precisionStep) {
|
||||
arr[idx++] = intToPrefixCoded(val, shift);
|
||||
}
|
||||
return arr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indexes the full precision value only in the main field (for sorting), and indexes all other
|
||||
* lower precision values in <code>field+LOWER_PRECISION_FIELD_NAME_SUFFIX</code>.
|
||||
* <p><b>This is the recommended variant to add trie fields to the index.</b>
|
||||
* By this it is possible to sort the field using a <code>SortField</code> instance
|
||||
* returned by {@link #getLongSortField} or {@link #getIntSortField}.
|
||||
* <p>This method does not store the fields and saves no term frequency or norms
|
||||
* (which are normally not needed for trie fields). If you want to additionally store
|
||||
* the value, you can use the normal methods of {@link Document} to achive this, just specify
|
||||
* <code>Field.Store.YES</code>, <code>Field.Index.NO</code> and the same field name.
|
||||
* <p>Examples:
|
||||
* <pre>
|
||||
* addIndexedFields(doc, "mydouble", trieCodeLong(doubleToSortableLong(1.414d), 4));
|
||||
* addIndexedFields(doc, "mylong", trieCodeLong(123456L, 4));
|
||||
* </pre>
|
||||
**/
|
||||
public static void addIndexedFields(Document doc, String field, String[] trieCoded) {
|
||||
addIndexedFields(doc, new String[]{field, field+LOWER_PRECISION_FIELD_NAME_SUFFIX}, trieCoded);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: Indexes the full precision value only in the main field (for sorting), and indexes all other
|
||||
* lower precision values in the <code>lowerPrecision</code> field.
|
||||
* If you do not specify the same field name for the main and lower precision one,
|
||||
* it is possible to sort the field using a <code>SortField</code> instance
|
||||
* returned by {@link #getLongSortField} or {@link #getIntSortField}.
|
||||
* <p>This method does not store the fields and saves no term frequency or norms
|
||||
* (which are normally not needed for trie fields). If you want to additionally store
|
||||
* the value, you can use the normal methods of {@link Document} to achive this, just specify
|
||||
* <code>Field.Store.YES</code>, <code>Field.Index.NO</code> and the same main field name.
|
||||
* <p>Examples:
|
||||
* <pre>
|
||||
* addIndexedFields(doc, "mydouble", "mydoubletrie", trieCodeLong(doubleToSortableLong(1.414d), 4));
|
||||
* addIndexedFields(doc, "mylong", "mylongtrie", trieCodeLong(123456L, 4));
|
||||
* </pre>
|
||||
* @see #addIndexedFields(Document,String,String[])
|
||||
**/
|
||||
public static void addIndexedFields(Document doc, String field, String lowerPrecisionField, String[] trieCoded) {
|
||||
addIndexedFields(doc, new String[]{field, lowerPrecisionField}, trieCoded);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: Indexes a series of trie coded values into a lucene {@link Document}
|
||||
* using the given field names.
|
||||
* If the array of field names is shorter than the trie coded one, all trie coded
|
||||
* values with higher index get the last field name.
|
||||
* <p>This method does not store the fields and saves no term frequency or norms
|
||||
* (which are normally not needed for trie fields). If you want to additionally store
|
||||
* the value, you can use the normal methods of {@link Document} to achive this, just specify
|
||||
* <code>Field.Store.YES</code>, <code>Field.Index.NO</code> and the same main field name.
|
||||
**/
|
||||
public static void addIndexedFields(Document doc, String[] fields, String[] trieCoded) {
|
||||
for (int i=0; i<trieCoded.length; i++) {
|
||||
final int fnum = Math.min(fields.length-1, i);
|
||||
final Field f = new Field(fields[fnum], trieCoded[i], Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
|
||||
f.setOmitTermFreqAndPositions(true);
|
||||
doc.add(f);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: Splits a long range recursively.
|
||||
* You may implement a builder that adds clauses to a
|
||||
* {@link org.apache.lucene.search.BooleanQuery} for each call to its
|
||||
* {@link LongRangeBuilder#addRange(String,String,int)}
|
||||
* {@link LongRangeBuilder#addRange(String,String)}
|
||||
* method.
|
||||
* <p>This method is used by {@link LongTrieRangeFilter}.
|
||||
*/
|
||||
|
@ -402,7 +319,7 @@ public final class TrieUtils {
|
|||
* Expert: Splits an int range recursively.
|
||||
* You may implement a builder that adds clauses to a
|
||||
* {@link org.apache.lucene.search.BooleanQuery} for each call to its
|
||||
* {@link IntRangeBuilder#addRange(String,String,int)}
|
||||
* {@link IntRangeBuilder#addRange(String,String)}
|
||||
* method.
|
||||
* <p>This method is used by {@link IntTrieRangeFilter}.
|
||||
*/
|
||||
|
@ -419,7 +336,7 @@ public final class TrieUtils {
|
|||
final Object builder, final int valSize,
|
||||
final int precisionStep, long minBound, long maxBound
|
||||
) {
|
||||
for (int level=0,shift=0;; level++) {
|
||||
for (int shift=0; ; shift += precisionStep) {
|
||||
// calculate new bounds for inner precision
|
||||
final long diff = 1L << (shift+precisionStep),
|
||||
mask = ((1L<<precisionStep) - 1L) << shift;
|
||||
|
@ -432,20 +349,19 @@ public final class TrieUtils {
|
|||
|
||||
if (shift+precisionStep>=valSize || nextMinBound>nextMaxBound) {
|
||||
// We are in the lowest precision or the next precision is not available.
|
||||
addRange(builder, valSize, minBound, maxBound, shift, level);
|
||||
addRange(builder, valSize, minBound, maxBound, shift);
|
||||
// exit the split recursion loop
|
||||
break;
|
||||
}
|
||||
|
||||
if (hasLower)
|
||||
addRange(builder, valSize, minBound, minBound | mask, shift, level);
|
||||
addRange(builder, valSize, minBound, minBound | mask, shift);
|
||||
if (hasUpper)
|
||||
addRange(builder, valSize, maxBound & ~mask, maxBound, shift, level);
|
||||
addRange(builder, valSize, maxBound & ~mask, maxBound, shift);
|
||||
|
||||
// recurse to next precision
|
||||
minBound = nextMinBound;
|
||||
maxBound = nextMaxBound;
|
||||
shift += precisionStep;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -453,7 +369,7 @@ public final class TrieUtils {
|
|||
private static void addRange(
|
||||
final Object builder, final int valSize,
|
||||
long minBound, long maxBound,
|
||||
final int shift, final int level
|
||||
final int shift
|
||||
) {
|
||||
// for the max bound set all lower bits (that were shifted away):
|
||||
// this is important for testing or other usages of the split range
|
||||
|
@ -463,10 +379,10 @@ public final class TrieUtils {
|
|||
// delegate to correct range builder
|
||||
switch(valSize) {
|
||||
case 64:
|
||||
((LongRangeBuilder)builder).addRange(minBound, maxBound, shift, level);
|
||||
((LongRangeBuilder)builder).addRange(minBound, maxBound, shift);
|
||||
break;
|
||||
case 32:
|
||||
((IntRangeBuilder)builder).addRange((int)minBound, (int)maxBound, shift, level);
|
||||
((IntRangeBuilder)builder).addRange((int)minBound, (int)maxBound, shift);
|
||||
break;
|
||||
default:
|
||||
// Should not happen!
|
||||
|
@ -484,16 +400,9 @@ public final class TrieUtils {
|
|||
|
||||
/**
|
||||
* Override this method if you would like to receive the already prefix encoded range bounds.
|
||||
* You can directly build classical range queries from them.
|
||||
* The level gives the precision level (0 = highest precision) of the encoded values.
|
||||
* This parameter could be used as an index to an array of fieldnames like the
|
||||
* parameters to {@link #addIndexedFields(Document,String[],String[])} for specifying
|
||||
* the field names for each precision:
|
||||
* <pre>
|
||||
* String field = fields[Math.min(fields.length-1, level)];
|
||||
* </pre>
|
||||
* You can directly build classical (inclusive) range queries from them.
|
||||
*/
|
||||
public void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
|
||||
public void addRange(String minPrefixCoded, String maxPrefixCoded) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
@ -501,10 +410,8 @@ public final class TrieUtils {
|
|||
* Override this method if you would like to receive the raw long range bounds.
|
||||
* You can use this for e.g. debugging purposes (print out range bounds).
|
||||
*/
|
||||
public void addRange(final long min, final long max, final int shift, final int level) {
|
||||
/*System.out.println(Long.toHexString((min^0x8000000000000000L) >>> shift)+".."+
|
||||
Long.toHexString((max^0x8000000000000000L) >>> shift));*/
|
||||
addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift), level);
|
||||
public void addRange(final long min, final long max, final int shift) {
|
||||
addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift));
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -519,16 +426,9 @@ public final class TrieUtils {
|
|||
|
||||
/**
|
||||
* Override this method if you would like to receive the already prefix encoded range bounds.
|
||||
* You can directly build classical range queries from them.
|
||||
* The level gives the precision level (0 = highest precision) of the encoded values.
|
||||
* This parameter could be used as an index to an array of fieldnames like the
|
||||
* parameters to {@link #addIndexedFields(Document,String[],String[])} for specifying
|
||||
* the field names for each precision:
|
||||
* <pre>
|
||||
* String field = fields[Math.min(fields.length-1, level)];
|
||||
* </pre>
|
||||
* You can directly build classical (inclusive) range queries from them.
|
||||
*/
|
||||
public void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
|
||||
public void addRange(String minPrefixCoded, String maxPrefixCoded) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
@ -536,10 +436,8 @@ public final class TrieUtils {
|
|||
* Override this method if you would like to receive the raw int range bounds.
|
||||
* You can use this for e.g. debugging purposes (print out range bounds).
|
||||
*/
|
||||
public void addRange(final int min, final int max, final int shift, final int level) {
|
||||
/*System.out.println(Integer.toHexString((min^0x80000000) >>> shift)+".."+
|
||||
Integer.toHexString((max^0x80000000) >>> shift));*/
|
||||
addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift), level);
|
||||
public void addRange(final int min, final int max, final int shift) {
|
||||
addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -50,10 +50,14 @@ using a multiple of the original step value.</p>
|
|||
are no longer dependent on the index size and the number of distinct values because there is
|
||||
an upper limit unrelated to either of these properties.</p>
|
||||
|
||||
<h3>Usage</h3>
|
||||
<h3>Indexing Usage</h3>
|
||||
<p>To use the new query types, the numerical values, which may be <code>long</code>, <code>double</code>, <code>int</code>,
|
||||
<code>float</code>, or <code>Date</code>, the values must be indexed in a special prefix encoded format
|
||||
(using {@link org.apache.lucene.search.trie.TrieUtils}). This can be done like this:</p>
|
||||
using {@link org.apache.lucene.search.trie.LongTrieTokenStream} or
|
||||
{@link org.apache.lucene.search.trie.IntTrieTokenStream}, which generate the necessary tokens.
|
||||
Use {@link org.apache.lucene.search.trie.TrieUtils} to convert floating point values to integers.
|
||||
Example code for indexing (it is recommended to disable norms and term frequencies during indexing
|
||||
trie encoded fields):</p>
|
||||
|
||||
<pre>
|
||||
<em>// choose a step value; 8 is generally a good value for large indexes:</em>
|
||||
|
@ -67,15 +71,25 @@ an upper limit unrelated to either of these properties.</p>
|
|||
|
||||
<em>// add some numerical fields:</em>
|
||||
long lvalue = 121345L;
|
||||
TrieUtils.addIndexedFields(doc, "exampleLong", TrieUtils.trieCodeLong(lvalue, precisionStep));
|
||||
Field f = new Field("exampleLong", new LongTrieTokenStream(lvalue, precisionStep));
|
||||
f.setOmitNorms(true); f.setOmitTermFreqAndPositions(true);
|
||||
doc.add(f);
|
||||
double dvalue = 1.057E17;
|
||||
TrieUtils.addIndexedFields(doc, "exampleDouble", TrieUtils.trieCodeLong(TrieUtils.doubleToSortableLong(dvalue), precisionStep));
|
||||
f = new Field("exampleDouble", new LongTrieTokenStream(TrieUtils.doubleToSortableLong(dvalue), precisionStep));
|
||||
f.setOmitNorms(true); f.setOmitTermFreqAndPositions(true);
|
||||
doc.add(f);
|
||||
int ivalue = 121345;
|
||||
TrieUtils.addIndexedFields(doc, "exampleInt", TrieUtils.trieCodeInt(ivalue, precisionStep));
|
||||
f = new Field("exampleInt", new IntTrieTokenStream(ivalue, precisionStep));
|
||||
f.setOmitNorms(true); f.setOmitTermFreqAndPositions(true);
|
||||
doc.add(f);
|
||||
float fvalue = 1.057E17f;
|
||||
TrieUtils.addIndexedFields(doc, "exampleFloat", TrieUtils.trieCodeInt(TrieUtils.floatToSortableInt(fvalue), precisionStep));
|
||||
f = new Field("exampleFloat", new IntTrieTokenStream(TrieUtils.floatToSortableInt(fvalue), precisionStep));
|
||||
f.setOmitNorms(true); f.setOmitTermFreqAndPositions(true);
|
||||
doc.add(f);
|
||||
Date datevalue = new Date(); <em>// actual time</em>
|
||||
TrieUtils.addIndexedFields(doc, "exampleDate", TrieUtils.trieCodeLong(datevalue.getTime(), precisionStep));
|
||||
f = new Field("exampleDate", new LongTrieTokenStream(datevalue.getTime(), precisionStep));
|
||||
f.setOmitNorms(true); f.setOmitTermFreqAndPositions(true);
|
||||
doc.add(f);
|
||||
|
||||
<em>// if you want to also store one of the values:</em>
|
||||
doc.add(new Field("exampleLong", Long.toString(lvalue), Field.Store.YES, Field.Index.NO));
|
||||
|
@ -86,6 +100,11 @@ an upper limit unrelated to either of these properties.</p>
|
|||
<em>// now add document to IndexWriter, as usual</em>
|
||||
</pre>
|
||||
|
||||
<p><em>(for higher indexing performance, you can reuse the TokenStreams –
|
||||
more info about this in the stream documentation)</em></p>
|
||||
|
||||
<h3>Searching</h3>
|
||||
|
||||
<p>The numeric index fields you prepared in this way can be searched by
|
||||
{@link org.apache.lucene.search.trie.LongTrieRangeFilter} or {@link org.apache.lucene.search.trie.IntTrieRangeFilter}:</p>
|
||||
|
||||
|
|
|
@ -33,8 +33,7 @@ import org.apache.lucene.search.Sort;
|
|||
import org.apache.lucene.search.RangeQuery;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestIntTrieRangeFilter extends LuceneTestCase
|
||||
{
|
||||
public class TestIntTrieRangeFilter extends LuceneTestCase {
|
||||
// distance of entries
|
||||
private static final int distance = 6666;
|
||||
// shift the starting of the values to the left, to also have negative values:
|
||||
|
@ -42,6 +41,15 @@ public class TestIntTrieRangeFilter extends LuceneTestCase
|
|||
// number of docs to generate for testing
|
||||
private static final int noDocs = 10000;
|
||||
|
||||
private static Field newField(String name, int precisionStep) {
|
||||
IntTrieTokenStream stream = new IntTrieTokenStream(precisionStep);
|
||||
stream.setUseNewAPI(true);
|
||||
Field f=new Field(name, stream);
|
||||
f.setOmitTermFreqAndPositions(true);
|
||||
f.setOmitNorms(true);
|
||||
return f;
|
||||
}
|
||||
|
||||
private static final RAMDirectory directory;
|
||||
private static final IndexSearcher searcher;
|
||||
static {
|
||||
|
@ -50,21 +58,34 @@ public class TestIntTrieRangeFilter extends LuceneTestCase
|
|||
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(),
|
||||
true, MaxFieldLength.UNLIMITED);
|
||||
|
||||
Field
|
||||
field8 = newField("field8", 8),
|
||||
field4 = newField("field4", 4),
|
||||
field2 = newField("field2", 2),
|
||||
ascfield8 = newField("ascfield8", 8),
|
||||
ascfield4 = newField("ascfield4", 4),
|
||||
ascfield2 = newField("ascfield2", 2);
|
||||
|
||||
// Add a series of noDocs docs with increasing int values
|
||||
for (int l=0; l<noDocs; l++) {
|
||||
Document doc=new Document();
|
||||
// add fields, that have a distance to test general functionality
|
||||
final int val=distance*l+startOffset;
|
||||
TrieUtils.addIndexedFields(doc,"field8", TrieUtils.trieCodeInt(val, 8));
|
||||
doc.add(new Field("field8", TrieUtils.intToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
|
||||
TrieUtils.addIndexedFields(doc,"field4", TrieUtils.trieCodeInt(val, 4));
|
||||
doc.add(new Field("field4", TrieUtils.intToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
|
||||
TrieUtils.addIndexedFields(doc,"field2", TrieUtils.trieCodeInt(val, 2));
|
||||
doc.add(new Field("field2", TrieUtils.intToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
|
||||
int val=distance*l+startOffset;
|
||||
doc.add(new Field("value", TrieUtils.intToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
|
||||
((IntTrieTokenStream)field8.tokenStreamValue()).setValue(val);
|
||||
doc.add(field8);
|
||||
((IntTrieTokenStream)field4.tokenStreamValue()).setValue(val);
|
||||
doc.add(field4);
|
||||
((IntTrieTokenStream)field2.tokenStreamValue()).setValue(val);
|
||||
doc.add(field2);
|
||||
// add ascending fields with a distance of 1, beginning at -noDocs/2 to test the correct splitting of range and inclusive/exclusive
|
||||
TrieUtils.addIndexedFields(doc,"ascfield8", TrieUtils.trieCodeInt(l-(noDocs/2), 8));
|
||||
TrieUtils.addIndexedFields(doc,"ascfield4", TrieUtils.trieCodeInt(l-(noDocs/2), 4));
|
||||
TrieUtils.addIndexedFields(doc,"ascfield2", TrieUtils.trieCodeInt(l-(noDocs/2), 2));
|
||||
val=l-(noDocs/2);
|
||||
((IntTrieTokenStream)ascfield8.tokenStreamValue()).setValue(val);
|
||||
doc.add(ascfield8);
|
||||
((IntTrieTokenStream)ascfield4.tokenStreamValue()).setValue(val);
|
||||
doc.add(ascfield4);
|
||||
((IntTrieTokenStream)ascfield2.tokenStreamValue()).setValue(val);
|
||||
doc.add(ascfield2);
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
|
@ -87,9 +108,9 @@ public class TestIntTrieRangeFilter extends LuceneTestCase
|
|||
assertNotNull(sd);
|
||||
assertEquals("Score doc count", count, sd.length );
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("First doc", 2*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
|
||||
assertEquals("First doc", 2*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get("value")) );
|
||||
doc=searcher.doc(sd[sd.length-1].doc);
|
||||
assertEquals("Last doc", (1+count)*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
|
||||
assertEquals("Last doc", (1+count)*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get("value")) );
|
||||
}
|
||||
|
||||
public void testRange_8bit() throws Exception {
|
||||
|
@ -115,9 +136,9 @@ public class TestIntTrieRangeFilter extends LuceneTestCase
|
|||
assertNotNull(sd);
|
||||
assertEquals("Score doc count", count, sd.length );
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("First doc", startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
|
||||
assertEquals("First doc", startOffset, TrieUtils.prefixCodedToInt(doc.get("value")) );
|
||||
doc=searcher.doc(sd[sd.length-1].doc);
|
||||
assertEquals("Last doc", (count-1)*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
|
||||
assertEquals("Last doc", (count-1)*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get("value")) );
|
||||
}
|
||||
|
||||
public void testLeftOpenRange_8bit() throws Exception {
|
||||
|
@ -143,9 +164,9 @@ public class TestIntTrieRangeFilter extends LuceneTestCase
|
|||
assertNotNull(sd);
|
||||
assertEquals("Score doc count", noDocs-count, sd.length );
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("First doc", count*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
|
||||
assertEquals("First doc", count*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get("value")) );
|
||||
doc=searcher.doc(sd[sd.length-1].doc);
|
||||
assertEquals("Last doc", (noDocs-1)*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get(field)) );
|
||||
assertEquals("Last doc", (noDocs-1)*distance+startOffset, TrieUtils.prefixCodedToInt(doc.get("value")) );
|
||||
}
|
||||
|
||||
public void testRightOpenRange_8bit() throws Exception {
|
||||
|
@ -163,39 +184,47 @@ public class TestIntTrieRangeFilter extends LuceneTestCase
|
|||
private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception {
|
||||
final Random rnd=newRandom();
|
||||
String field="field"+precisionStep;
|
||||
// 50 random tests, the tests may also return 0 results, if min>max, but this is ok
|
||||
int termCount=0;
|
||||
for (int i=0; i<50; i++) {
|
||||
int lower=(int)(rnd.nextDouble()*noDocs*distance)+startOffset;
|
||||
int upper=(int)(rnd.nextDouble()*noDocs*distance)+startOffset;
|
||||
if (lower>upper) {
|
||||
int a=lower; lower=upper; upper=a;
|
||||
}
|
||||
// test inclusive range
|
||||
Query tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), true, true).asQuery();
|
||||
IntTrieRangeFilter tf=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), true, true);
|
||||
RangeQuery cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), true, true);
|
||||
cq.setConstantScoreRewrite(true);
|
||||
TopDocs tTopDocs = searcher.search(tq, 1);
|
||||
TopDocs tTopDocs = searcher.search(tf.asQuery(), 1);
|
||||
TopDocs cTopDocs = searcher.search(cq, 1);
|
||||
assertEquals("Returned count for IntTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
|
||||
termCount += tf.getLastNumberOfTerms();
|
||||
// test exclusive range
|
||||
tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), false, false).asQuery();
|
||||
tf=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), false, false);
|
||||
cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), false, false);
|
||||
cq.setConstantScoreRewrite(true);
|
||||
tTopDocs = searcher.search(tq, 1);
|
||||
tTopDocs = searcher.search(tf.asQuery(), 1);
|
||||
cTopDocs = searcher.search(cq, 1);
|
||||
assertEquals("Returned count for IntTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
|
||||
termCount += tf.getLastNumberOfTerms();
|
||||
// test left exclusive range
|
||||
tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), false, true).asQuery();
|
||||
tf=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), false, true);
|
||||
cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), false, true);
|
||||
cq.setConstantScoreRewrite(true);
|
||||
tTopDocs = searcher.search(tq, 1);
|
||||
tTopDocs = searcher.search(tf.asQuery(), 1);
|
||||
cTopDocs = searcher.search(cq, 1);
|
||||
assertEquals("Returned count for IntTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
|
||||
termCount += tf.getLastNumberOfTerms();
|
||||
// test right exclusive range
|
||||
tq=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), true, false).asQuery();
|
||||
tf=new IntTrieRangeFilter(field, precisionStep, new Integer(lower), new Integer(upper), true, false);
|
||||
cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), true, false);
|
||||
cq.setConstantScoreRewrite(true);
|
||||
tTopDocs = searcher.search(tq, 1);
|
||||
tTopDocs = searcher.search(tf.asQuery(), 1);
|
||||
cTopDocs = searcher.search(cq, 1);
|
||||
assertEquals("Returned count for IntTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
|
||||
termCount += tf.getLastNumberOfTerms();
|
||||
}
|
||||
System.out.println("Average number of terms during random search: " + (((double)termCount)/(50*4)));
|
||||
}
|
||||
|
||||
public void testRandomTrieAndClassicRangeQuery_8bit() throws Exception {
|
||||
|
@ -267,9 +296,9 @@ public class TestIntTrieRangeFilter extends LuceneTestCase
|
|||
if (topDocs.totalHits==0) continue;
|
||||
ScoreDoc[] sd = topDocs.scoreDocs;
|
||||
assertNotNull(sd);
|
||||
int last=TrieUtils.prefixCodedToInt(searcher.doc(sd[0].doc).get(field));
|
||||
int last=TrieUtils.prefixCodedToInt(searcher.doc(sd[0].doc).get("value"));
|
||||
for (int j=1; j<sd.length; j++) {
|
||||
int act=TrieUtils.prefixCodedToInt(searcher.doc(sd[j].doc).get(field));
|
||||
int act=TrieUtils.prefixCodedToInt(searcher.doc(sd[j].doc).get("value"));
|
||||
assertTrue("Docs should be sorted backwards", last>act );
|
||||
last=act;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
package org.apache.lucene.search.trie;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
 * Checks the tokens produced by {@link IntTrieTokenStream} for a single int value.
 * The same expectations are verified twice: once through the attribute-based
 * API ({@code incrementToken()}) and once through the Token-based API
 * ({@code next(Token)}).
 */
public class TestIntTrieTokenStream extends LuceneTestCase {
|
||||
|
||||
// Step by which the expected shift advances from one token to the next in both tests.
static final int precisionStep = 8;
|
||||
// Sample value handed to the stream; every expected term is derived from it.
static final int value = 123456;
|
||||
|
||||
/**
 * Consumes the stream via incrementToken() and expects exactly one token for
 * each shift 0, 8, 16 and 24, then end of stream.
 */
public void testStreamNewAPI() throws Exception {
|
||||
final IntTrieTokenStream stream=new IntTrieTokenStream(value, precisionStep);
|
||||
stream.setUseNewAPI(true);
|
||||
// Attributes are fetched once up front and read after every incrementToken() call.
final ShiftAttribute shiftAtt = (ShiftAttribute) stream.addAttribute(ShiftAttribute.class);
|
||||
final TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
|
||||
for (int shift=0; shift<32; shift+=precisionStep) {
|
||||
assertTrue("New token is available", stream.incrementToken());
|
||||
assertEquals("Shift value", shift, shiftAtt.getShift());
|
||||
// The term must equal the prefix coding of the value at the current shift.
assertEquals("Term is correctly encoded", TrieUtils.intToPrefixCoded(value, shift), termAtt.term());
|
||||
}
|
||||
// After the last expected shift the stream must report exhaustion.
assertFalse("No more tokens available", stream.incrementToken());
|
||||
}
|
||||
|
||||
/**
 * Consumes the stream via next(Token) and expects one correctly encoded term
 * for each shift 0, 8, 16 and 24, then null. The shift itself is not checked
 * here; only the term text is compared.
 */
public void testStreamOldAPI() throws Exception {
|
||||
final IntTrieTokenStream stream=new IntTrieTokenStream(value, precisionStep);
|
||||
stream.setUseNewAPI(false);
|
||||
Token tok=new Token();
|
||||
for (int shift=0; shift<32; shift+=precisionStep) {
|
||||
// The returned token replaces tok and is passed back in on the next call.
assertNotNull("New token is available", tok=stream.next(tok));
|
||||
assertEquals("Term is correctly encoded", TrieUtils.intToPrefixCoded(value, shift), tok.term());
|
||||
}
|
||||
// After the last expected shift next(Token) must return null.
assertNull("No more tokens available", stream.next(tok));
|
||||
}
|
||||
|
||||
}
|
|
@ -33,8 +33,7 @@ import org.apache.lucene.search.Sort;
|
|||
import org.apache.lucene.search.RangeQuery;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestLongTrieRangeFilter extends LuceneTestCase
|
||||
{
|
||||
public class TestLongTrieRangeFilter extends LuceneTestCase {
|
||||
// distance of entries
|
||||
private static final long distance = 66666L;
|
||||
// shift the starting of the values to the left, to also have negative values:
|
||||
|
@ -42,6 +41,15 @@ public class TestLongTrieRangeFilter extends LuceneTestCase
|
|||
// number of docs to generate for testing
|
||||
private static final int noDocs = 10000;
|
||||
|
||||
/**
 * Builds a Field named {@code name} whose value is a fresh LongTrieTokenStream
 * created with the given precision step. No numeric value is supplied here;
 * the static initializer of this test assigns one through
 * {@code tokenStreamValue()} / {@code setValue(long)} before adding the field
 * to a document.
 *
 * @param name          name of the field to create
 * @param precisionStep precision step passed to the LongTrieTokenStream constructor
 * @return the configured field, with term frequencies/positions and norms omitted
 */
private static Field newField(String name, int precisionStep) {
|
||||
LongTrieTokenStream stream = new LongTrieTokenStream(precisionStep);
|
||||
// Switch the stream to the attribute-based (new) TokenStream API.
stream.setUseNewAPI(true);
|
||||
Field f=new Field(name, stream);
|
||||
// Turn off term frequency/position data and norms for this field.
f.setOmitTermFreqAndPositions(true);
|
||||
f.setOmitNorms(true);
|
||||
return f;
|
||||
}
|
||||
|
||||
private static final RAMDirectory directory;
|
||||
private static final IndexSearcher searcher;
|
||||
static {
|
||||
|
@ -50,21 +58,34 @@ public class TestLongTrieRangeFilter extends LuceneTestCase
|
|||
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(),
|
||||
true, MaxFieldLength.UNLIMITED);
|
||||
|
||||
Field
|
||||
field8 = newField("field8", 8),
|
||||
field4 = newField("field4", 4),
|
||||
field2 = newField("field2", 2),
|
||||
ascfield8 = newField("ascfield8", 8),
|
||||
ascfield4 = newField("ascfield4", 4),
|
||||
ascfield2 = newField("ascfield2", 2);
|
||||
|
||||
// Add a series of noDocs docs with increasing long values
|
||||
for (int l=0; l<noDocs; l++) {
|
||||
Document doc=new Document();
|
||||
// add fields, that have a distance to test general functionality
|
||||
final long val=distance*l+startOffset;
|
||||
TrieUtils.addIndexedFields(doc,"field8", TrieUtils.trieCodeLong(val, 8));
|
||||
doc.add(new Field("field8", TrieUtils.longToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
|
||||
TrieUtils.addIndexedFields(doc,"field4", TrieUtils.trieCodeLong(val, 4));
|
||||
doc.add(new Field("field4", TrieUtils.longToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
|
||||
TrieUtils.addIndexedFields(doc,"field2", TrieUtils.trieCodeLong(val, 2));
|
||||
doc.add(new Field("field2", TrieUtils.longToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
|
||||
long val=distance*l+startOffset;
|
||||
doc.add(new Field("value", TrieUtils.longToPrefixCoded(val), Field.Store.YES, Field.Index.NO));
|
||||
((LongTrieTokenStream)field8.tokenStreamValue()).setValue(val);
|
||||
doc.add(field8);
|
||||
((LongTrieTokenStream)field4.tokenStreamValue()).setValue(val);
|
||||
doc.add(field4);
|
||||
((LongTrieTokenStream)field2.tokenStreamValue()).setValue(val);
|
||||
doc.add(field2);
|
||||
// add ascending fields with a distance of 1, beginning at -noDocs/2 to test the correct splitting of range and inclusive/exclusive
|
||||
TrieUtils.addIndexedFields(doc,"ascfield8", TrieUtils.trieCodeLong(l-(noDocs/2), 8));
|
||||
TrieUtils.addIndexedFields(doc,"ascfield4", TrieUtils.trieCodeLong(l-(noDocs/2), 4));
|
||||
TrieUtils.addIndexedFields(doc,"ascfield2", TrieUtils.trieCodeLong(l-(noDocs/2), 2));
|
||||
val=l-(noDocs/2);
|
||||
((LongTrieTokenStream)ascfield8.tokenStreamValue()).setValue(val);
|
||||
doc.add(ascfield8);
|
||||
((LongTrieTokenStream)ascfield4.tokenStreamValue()).setValue(val);
|
||||
doc.add(ascfield4);
|
||||
((LongTrieTokenStream)ascfield2.tokenStreamValue()).setValue(val);
|
||||
doc.add(ascfield2);
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
|
@ -87,9 +108,9 @@ public class TestLongTrieRangeFilter extends LuceneTestCase
|
|||
assertNotNull(sd);
|
||||
assertEquals("Score doc count", count, sd.length );
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("First doc", 2*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
|
||||
assertEquals("First doc", 2*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get("value")) );
|
||||
doc=searcher.doc(sd[sd.length-1].doc);
|
||||
assertEquals("Last doc", (1+count)*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
|
||||
assertEquals("Last doc", (1+count)*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get("value")) );
|
||||
}
|
||||
|
||||
public void testRange_8bit() throws Exception {
|
||||
|
@ -115,9 +136,9 @@ public class TestLongTrieRangeFilter extends LuceneTestCase
|
|||
assertNotNull(sd);
|
||||
assertEquals("Score doc count", count, sd.length );
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("First doc", startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
|
||||
assertEquals("First doc", startOffset, TrieUtils.prefixCodedToLong(doc.get("value")) );
|
||||
doc=searcher.doc(sd[sd.length-1].doc);
|
||||
assertEquals("Last doc", (count-1)*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
|
||||
assertEquals("Last doc", (count-1)*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get("value")) );
|
||||
}
|
||||
|
||||
public void testLeftOpenRange_8bit() throws Exception {
|
||||
|
@ -143,9 +164,9 @@ public class TestLongTrieRangeFilter extends LuceneTestCase
|
|||
assertNotNull(sd);
|
||||
assertEquals("Score doc count", noDocs-count, sd.length );
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("First doc", count*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
|
||||
assertEquals("First doc", count*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get("value")) );
|
||||
doc=searcher.doc(sd[sd.length-1].doc);
|
||||
assertEquals("Last doc", (noDocs-1)*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get(field)) );
|
||||
assertEquals("Last doc", (noDocs-1)*distance+startOffset, TrieUtils.prefixCodedToLong(doc.get("value")) );
|
||||
}
|
||||
|
||||
public void testRightOpenRange_8bit() throws Exception {
|
||||
|
@ -163,39 +184,47 @@ public class TestLongTrieRangeFilter extends LuceneTestCase
|
|||
private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception {
|
||||
final Random rnd=newRandom();
|
||||
String field="field"+precisionStep;
|
||||
// 50 random tests, the tests may also return 0 results, if min>max, but this is ok
|
||||
int termCount=0;
|
||||
for (int i=0; i<50; i++) {
|
||||
long lower=(long)(rnd.nextDouble()*noDocs*distance)+startOffset;
|
||||
long upper=(long)(rnd.nextDouble()*noDocs*distance)+startOffset;
|
||||
if (lower>upper) {
|
||||
long a=lower; lower=upper; upper=a;
|
||||
}
|
||||
// test inclusive range
|
||||
Query tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), true, true).asQuery();
|
||||
LongTrieRangeFilter tf=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), true, true);
|
||||
RangeQuery cq=new RangeQuery(field, TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), true, true);
|
||||
cq.setConstantScoreRewrite(true);
|
||||
TopDocs tTopDocs = searcher.search(tq, 1);
|
||||
TopDocs tTopDocs = searcher.search(tf.asQuery(), 1);
|
||||
TopDocs cTopDocs = searcher.search(cq, 1);
|
||||
assertEquals("Returned count for LongTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
|
||||
termCount += tf.getLastNumberOfTerms();
|
||||
// test exclusive range
|
||||
tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), false, false).asQuery();
|
||||
tf=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), false, false);
|
||||
cq=new RangeQuery(field, TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), false, false);
|
||||
cq.setConstantScoreRewrite(true);
|
||||
tTopDocs = searcher.search(tq, 1);
|
||||
tTopDocs = searcher.search(tf.asQuery(), 1);
|
||||
cTopDocs = searcher.search(cq, 1);
|
||||
assertEquals("Returned count for LongTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
|
||||
termCount += tf.getLastNumberOfTerms();
|
||||
// test left exclusive range
|
||||
tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), false, true).asQuery();
|
||||
tf=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), false, true);
|
||||
cq=new RangeQuery(field, TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), false, true);
|
||||
cq.setConstantScoreRewrite(true);
|
||||
tTopDocs = searcher.search(tq, 1);
|
||||
tTopDocs = searcher.search(tf.asQuery(), 1);
|
||||
cTopDocs = searcher.search(cq, 1);
|
||||
assertEquals("Returned count for LongTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
|
||||
termCount += tf.getLastNumberOfTerms();
|
||||
// test right exclusive range
|
||||
tq=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), true, false).asQuery();
|
||||
tf=new LongTrieRangeFilter(field, precisionStep, new Long(lower), new Long(upper), true, false);
|
||||
cq=new RangeQuery(field, TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), true, false);
|
||||
cq.setConstantScoreRewrite(true);
|
||||
tTopDocs = searcher.search(tq, 1);
|
||||
tTopDocs = searcher.search(tf.asQuery(), 1);
|
||||
cTopDocs = searcher.search(cq, 1);
|
||||
assertEquals("Returned count for LongTrieRangeFilter and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
|
||||
termCount += tf.getLastNumberOfTerms();
|
||||
}
|
||||
System.out.println("Average number of terms during random search: " + (((double)termCount)/(50*4)));
|
||||
}
|
||||
|
||||
public void testRandomTrieAndClassicRangeQuery_8bit() throws Exception {
|
||||
|
@ -267,9 +296,9 @@ public class TestLongTrieRangeFilter extends LuceneTestCase
|
|||
if (topDocs.totalHits==0) continue;
|
||||
ScoreDoc[] sd = topDocs.scoreDocs;
|
||||
assertNotNull(sd);
|
||||
long last=TrieUtils.prefixCodedToLong(searcher.doc(sd[0].doc).get(field));
|
||||
long last=TrieUtils.prefixCodedToLong(searcher.doc(sd[0].doc).get("value"));
|
||||
for (int j=1; j<sd.length; j++) {
|
||||
long act=TrieUtils.prefixCodedToLong(searcher.doc(sd[j].doc).get(field));
|
||||
long act=TrieUtils.prefixCodedToLong(searcher.doc(sd[j].doc).get("value"));
|
||||
assertTrue("Docs should be sorted backwards", last>act );
|
||||
last=act;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
package org.apache.lucene.search.trie;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
 * Checks the tokens produced by {@link LongTrieTokenStream} for a single long value.
 * The same expectations are verified twice: once through the attribute-based
 * API ({@code incrementToken()}) and once through the Token-based API
 * ({@code next(Token)}).
 */
public class TestLongTrieTokenStream extends LuceneTestCase {
|
||||
|
||||
// Step by which the expected shift advances from one token to the next in both tests.
static final int precisionStep = 8;
|
||||
// Sample value handed to the stream; every expected term is derived from it.
static final long value = 4573245871874382L;
|
||||
|
||||
/**
 * Consumes the stream via incrementToken() and expects exactly one token for
 * each shift 0, 8, ..., 56, then end of stream.
 */
public void testStreamNewAPI() throws Exception {
|
||||
final LongTrieTokenStream stream=new LongTrieTokenStream(value, precisionStep);
|
||||
stream.setUseNewAPI(true);
|
||||
// Attributes are fetched once up front and read after every incrementToken() call.
final ShiftAttribute shiftAtt = (ShiftAttribute) stream.addAttribute(ShiftAttribute.class);
|
||||
final TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
|
||||
for (int shift=0; shift<64; shift+=precisionStep) {
|
||||
assertTrue("New token is available", stream.incrementToken());
|
||||
assertEquals("Shift value", shift, shiftAtt.getShift());
|
||||
// The term must equal the prefix coding of the value at the current shift.
assertEquals("Term is correctly encoded", TrieUtils.longToPrefixCoded(value, shift), termAtt.term());
|
||||
}
|
||||
// After the last expected shift the stream must report exhaustion.
assertFalse("No more tokens available", stream.incrementToken());
|
||||
}
|
||||
|
||||
/**
 * Consumes the stream via next(Token) and expects one correctly encoded term
 * for each shift 0, 8, ..., 56, then null. The shift itself is not checked
 * here; only the term text is compared.
 */
public void testStreamOldAPI() throws Exception {
|
||||
final LongTrieTokenStream stream=new LongTrieTokenStream(value, precisionStep);
|
||||
stream.setUseNewAPI(false);
|
||||
Token tok=new Token();
|
||||
for (int shift=0; shift<64; shift+=precisionStep) {
|
||||
// The returned token replaces tok and is passed back in on the next call.
assertNotNull("New token is available", tok=stream.next(tok));
|
||||
assertEquals("Term is correctly encoded", TrieUtils.longToPrefixCoded(value, shift), tok.term());
|
||||
}
|
||||
// After the last expected shift next(Token) must return null.
assertNull("No more tokens available", stream.next(tok));
|
||||
}
|
||||
|
||||
}
|
|
@ -179,7 +179,7 @@ public class TestTrieUtils extends LuceneTestCase {
|
|||
|
||||
TrieUtils.splitLongRange(new TrieUtils.LongRangeBuilder() {
|
||||
//@Override
|
||||
public void addRange(long min, long max, int shift, int level) {
|
||||
public void addRange(long min, long max, int shift) {
|
||||
assertTrue("min, max should be inside bounds", min>=lower && min<=upper && max>=lower && max<=upper);
|
||||
if (useBitSet) for (long l=min; l<=max; l++) {
|
||||
assertFalse("ranges should not overlap", bits.getAndSet(l-lower) );
|
||||
|
@ -253,7 +253,7 @@ public class TestTrieUtils extends LuceneTestCase {
|
|||
|
||||
TrieUtils.splitIntRange(new TrieUtils.IntRangeBuilder() {
|
||||
//@Override
|
||||
public void addRange(int min, int max, int shift, int level) {
|
||||
public void addRange(int min, int max, int shift) {
|
||||
assertTrue("min, max should be inside bounds", min>=lower && min<=upper && max>=lower && max<=upper);
|
||||
if (useBitSet) for (int i=min; i<=max; i++) {
|
||||
assertFalse("ranges should not overlap", bits.getAndSet(i-lower) );
|
||||
|
|
Loading…
Reference in New Issue