LUCENE-2514, LUCENE-2551: collation uses byte[] keys, deprecate old unscalable locale sort/range, termrangequery/filter work on bytes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1075210 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-02-28 05:15:50 +00:00
parent 81ee0e72d3
commit 308e0bd4a9
62 changed files with 1334 additions and 1535 deletions

View File

@ -313,6 +313,21 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
- o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
- o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
* LUCENE-2514: The option to use a Collator's order (instead of binary order) for
sorting and range queries has been moved to contrib/queries.
The collated TermRangeQuery/Filter have been renamed to SlowCollatedTermRangeQuery/Filter,
and the collated sorting has been moved to SlowCollatedStringComparator.
Note: this functionality isn't very scalable, so if you are using it, consider
indexing collation keys with the collation support in the analysis module instead.
To perform collated range queries, use a suitable collating analyzer (CollationKeyAnalyzer
or ICUCollationKeyAnalyzer) and call qp.setAnalyzeRangeTerms(true).
TermRangeQuery and TermRangeFilter now work purely on bytes. Both have helper factory methods
(newStringRange), similar to the NumericRange API, for easily performing range queries on
Strings; see the migration sketch below.
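For illustration, a minimal migration sketch (field name, bounds, and the Version constant are hypothetical; package names follow the trunk layout at the time of this commit):

import java.text.Collator;
import java.util.Locale;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.Version;

public class RangeMigrationSketch {
  public static void main(String[] args) throws Exception {
    // Byte-based ranges via the new String factories (replace the old String constructors):
    TermRangeQuery query = TermRangeQuery.newStringRange("content", "apple", "pear", true, true);
    TermRangeFilter filter = TermRangeFilter.newStringRange("content", "apple", "pear", true, true);

    // Collated ranges: index with a collating analyzer and have the parser
    // analyze range endpoints too (replaces the removed setRangeCollator):
    Collator collator = Collator.getInstance(new Locale("ar"));
    QueryParser qp = new QueryParser(Version.LUCENE_40, "content",
        new CollationKeyAnalyzer(collator));
    qp.setAnalyzeRangeTerms(true);
    Query collated = qp.parse("content:[\u062F TO \u0698]");
  }
}

Because the parser analyzes each range endpoint into a single token, the collating analyzer turns the endpoints into collation keys that compare correctly against the indexed keys.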
* LUCENE-2691: The near-real-time API has moved from IndexWriter to
IndexReader. Instead of IndexWriter.getReader(), call
IndexReader.open(IndexWriter) or IndexReader.reopen(IndexWriter).

View File

@ -828,7 +828,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
@Override
public void run() throws Exception {
numHighlights = 0;
TermRangeFilter rf = new TermRangeFilter("contents", "john", "john", true, true);
TermRangeFilter rf = TermRangeFilter.newStringRange("contents", "john", "john", true, true);
SpanQuery clauses[] = { new SpanTermQuery(new Term("contents", "john")),
new SpanTermQuery(new Term("contents", "kennedy")), };
SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
@ -851,7 +851,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
@Override
public void run() throws Exception {
numHighlights = 0;
TermRangeFilter rf = new TermRangeFilter("contents", "john", "john", true, true);
TermRangeFilter rf = TermRangeFilter.newStringRange("contents", "john", "john", true, true);
PhraseQuery pq = new PhraseQuery();
pq.add(new Term("contents", "john"));
pq.add(new Term("contents", "kennedy"));

View File

@ -0,0 +1,106 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.text.Collator;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.FieldCache.DocTerms;
import org.apache.lucene.util.BytesRef;
/** Sorts by a field's value using the given Collator
*
* <p><b>WARNING</b>: this is very slow; you'll
* get much better performance using the
* CollationKeyAnalyzer or ICUCollationKeyAnalyzer.
* @deprecated Index collation keys with CollationKeyAnalyzer or ICUCollationKeyAnalyzer instead.
* This class will be removed in Lucene 5.0
*/
@Deprecated
public final class SlowCollatedStringComparator extends FieldComparator {
private final String[] values;
private DocTerms currentDocTerms;
private final String field;
final Collator collator;
private String bottom;
private final BytesRef tempBR = new BytesRef();
public SlowCollatedStringComparator(int numHits, String field, Collator collator) {
values = new String[numHits];
this.field = field;
this.collator = collator;
}
@Override
public int compare(int slot1, int slot2) {
final String val1 = values[slot1];
final String val2 = values[slot2];
if (val1 == null) {
if (val2 == null) {
return 0;
}
return -1;
} else if (val2 == null) {
return 1;
}
return collator.compare(val1, val2);
}
@Override
public int compareBottom(int doc) {
final String val2 = currentDocTerms.getTerm(doc, tempBR).utf8ToString();
if (bottom == null) {
if (val2 == null) {
return 0;
}
return -1;
} else if (val2 == null) {
return 1;
}
return collator.compare(bottom, val2);
}
@Override
public void copy(int slot, int doc) {
final BytesRef br = currentDocTerms.getTerm(doc, tempBR);
if (br == null) {
values[slot] = null;
} else {
values[slot] = br.utf8ToString();
}
}
@Override
public FieldComparator setNextReader(AtomicReaderContext context) throws IOException {
currentDocTerms = FieldCache.DEFAULT.getTerms(context.reader, field);
return this;
}
@Override
public void setBottom(final int bottom) {
this.bottom = values[bottom];
}
@Override
public Comparable<?> value(int slot) {
final String s = values[slot];
return s == null ? null : new BytesRef(s);
}
}
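For reference, a usage sketch (field name hypothetical) mirroring TestSlowCollationMethods.testSort() added later in this commit:

// `searcher` and `collator` are assumed to already exist.
static TopFieldDocs collatedSort(IndexSearcher searcher, final Collator collator) throws IOException {
  SortField sf = new SortField("field", new FieldComparatorSource() {
    @Override
    public FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException {
      return new SlowCollatedStringComparator(numHits, fieldname, collator);
    }
  });
  return searcher.search(new MatchAllDocsQuery(), null, 10, new Sort(sf));
}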

View File

@ -0,0 +1,70 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.Collator;
/**
* A Filter that restricts search results to a range of term
* values in a given field.
*
* <p>This filter matches documents containing terms that fall into the
* supplied range according to {@link
* String#compareTo(String)}, unless a <code>Collator</code> is provided. It is not intended
* for numerical ranges; use {@link NumericRangeFilter} instead.
*
* <p>If you construct a large number of range filters with different ranges but on the
* same field, {@link FieldCacheRangeFilter} may have significantly better performance.
* @deprecated Index collation keys with CollationKeyAnalyzer or ICUCollationKeyAnalyzer instead.
* This class will be removed in Lucene 5.0
*/
@Deprecated
public class SlowCollatedTermRangeFilter extends MultiTermQueryWrapperFilter<SlowCollatedTermRangeQuery> {
/**
*
* @param lowerTerm The lower bound on this range
* @param upperTerm The upper bound on this range
* @param includeLower Does this range include the lower bound?
* @param includeUpper Does this range include the upper bound?
* @param collator The collator to use when determining range inclusion; set
* to null to use Unicode code point ordering instead of collation.
* @throws IllegalArgumentException if both terms are null or if
* lowerTerm is null and includeLower is true (similar for upperTerm
* and includeUpper)
*/
public SlowCollatedTermRangeFilter(String fieldName, String lowerTerm, String upperTerm,
boolean includeLower, boolean includeUpper,
Collator collator) {
super(new SlowCollatedTermRangeQuery(fieldName, lowerTerm, upperTerm, includeLower, includeUpper, collator));
}
/** Returns the lower value of this range filter */
public String getLowerTerm() { return query.getLowerTerm(); }
/** Returns the upper value of this range filter */
public String getUpperTerm() { return query.getUpperTerm(); }
/** Returns <code>true</code> if the lower endpoint is inclusive */
public boolean includesLower() { return query.includesLower(); }
/** Returns <code>true</code> if the upper endpoint is inclusive */
public boolean includesUpper() { return query.includesUpper(); }
/** Returns the collator used to determine range inclusion, if any. */
public Collator getCollator() { return query.getCollator(); }
}
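A brief usage sketch (field, bounds, and collator hypothetical), matching how TestSlowCollationMethods.testRangeFilter() below drives the filter:

// Wrap the deprecated filter in a ConstantScoreQuery to execute it.
static TopDocs collatedFilter(IndexSearcher searcher, Collator collator) throws IOException {
  Filter f = new SlowCollatedTermRangeFilter("field", "aaa", "zzz", true, true, collator);
  return searcher.search(new ConstantScoreQuery(f), 10);
}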

View File

@ -0,0 +1,176 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.text.Collator;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ToStringUtils;
/**
* A Query that matches documents within a range of terms.
*
* <p>This query matches documents containing terms that fall into the
* supplied range according to {@link
* String#compareTo(String)}, unless a <code>Collator</code> is provided. It is not intended
* for numerical ranges; use {@link NumericRangeQuery} instead.
*
* <p>This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
* rewrite method.
* @deprecated Index collation keys with CollationKeyAnalyzer or ICUCollationKeyAnalyzer instead.
* This class will be removed in Lucene 5.0
*/
@Deprecated
public class SlowCollatedTermRangeQuery extends MultiTermQuery {
private String lowerTerm;
private String upperTerm;
private boolean includeLower;
private boolean includeUpper;
private Collator collator;
/** Constructs a query selecting all terms greater/equal than
* <code>lowerTerm</code> but less/equal than <code>upperTerm</code>.
* <p>
* If an endpoint is null, it is said
* to be "open". Either or both endpoints may be open. Open endpoints may not
* be exclusive (you can't select all but the first or last term without
* explicitly specifying the term to exclude.)
* <p>
*
* @param lowerTerm The Term text at the lower end of the range
* @param upperTerm The Term text at the upper end of the range
* @param includeLower
* If true, the <code>lowerTerm</code> is
* included in the range.
* @param includeUpper
* If true, the <code>upperTerm</code> is
* included in the range.
* @param collator The collator to use to collate index Terms, to determine
* their membership in the range bounded by <code>lowerTerm</code> and
* <code>upperTerm</code>.
*/
public SlowCollatedTermRangeQuery(String field, String lowerTerm, String upperTerm,
boolean includeLower, boolean includeUpper, Collator collator) {
super(field);
this.lowerTerm = lowerTerm;
this.upperTerm = upperTerm;
this.includeLower = includeLower;
this.includeUpper = includeUpper;
this.collator = collator;
}
/** Returns the lower value of this range query */
public String getLowerTerm() { return lowerTerm; }
/** Returns the upper value of this range query */
public String getUpperTerm() { return upperTerm; }
/** Returns <code>true</code> if the lower endpoint is inclusive */
public boolean includesLower() { return includeLower; }
/** Returns <code>true</code> if the upper endpoint is inclusive */
public boolean includesUpper() { return includeUpper; }
/** Returns the collator used to determine range inclusion */
public Collator getCollator() { return collator; }
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
if (lowerTerm != null && upperTerm != null && collator.compare(lowerTerm, upperTerm) > 0) {
return TermsEnum.EMPTY;
}
TermsEnum tenum = terms.iterator();
if (lowerTerm == null && upperTerm == null) {
return tenum;
}
return new SlowCollatedTermRangeTermsEnum(tenum,
lowerTerm, upperTerm, includeLower, includeUpper, collator);
}
/** @deprecated Use {@link #getField()} instead. */
@Deprecated
public String field() {
return getField();
}
/** Prints a user-readable version of this query. */
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
if (!getField().equals(field)) {
buffer.append(getField());
buffer.append(":");
}
buffer.append(includeLower ? '[' : '{');
buffer.append(lowerTerm != null ? lowerTerm : "*");
buffer.append(" TO ");
buffer.append(upperTerm != null ? upperTerm : "*");
buffer.append(includeUpper ? ']' : '}');
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
}
@Override
public int hashCode() {
final int prime = 31;
int result = super.hashCode();
result = prime * result + ((collator == null) ? 0 : collator.hashCode());
result = prime * result + (includeLower ? 1231 : 1237);
result = prime * result + (includeUpper ? 1231 : 1237);
result = prime * result + ((lowerTerm == null) ? 0 : lowerTerm.hashCode());
result = prime * result + ((upperTerm == null) ? 0 : upperTerm.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (!super.equals(obj))
return false;
if (getClass() != obj.getClass())
return false;
SlowCollatedTermRangeQuery other = (SlowCollatedTermRangeQuery) obj;
if (collator == null) {
if (other.collator != null)
return false;
} else if (!collator.equals(other.collator))
return false;
if (includeLower != other.includeLower)
return false;
if (includeUpper != other.includeUpper)
return false;
if (lowerTerm == null) {
if (other.lowerTerm != null)
return false;
} else if (!lowerTerm.equals(other.lowerTerm))
return false;
if (upperTerm == null) {
if (other.upperTerm != null)
return false;
} else if (!upperTerm.equals(other.upperTerm))
return false;
return true;
}
}
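A brief usage sketch (field and bounds hypothetical); matching visits every term in the field, which is why this class is deprecated in favor of indexed collation keys:

static TopDocs collatedQuery(IndexSearcher searcher, Collator collator) throws IOException {
  Query q = new SlowCollatedTermRangeQuery("field", "\u062F", "\u0698", true, true, collator);
  return searcher.search(q, 10);
}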

View File

@ -0,0 +1,102 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.text.Collator;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
/**
* Subclass of FilteredTermsEnum for enumerating all terms that match the
* specified range parameters.
* <p>Term enumerations are always ordered by
* {@link #getComparator}. Each term in the enumeration is
* greater than all that precede it.</p>
* @deprecated Index collation keys with CollationKeyAnalyzer or ICUCollationKeyAnalyzer instead.
* This class will be removed in Lucene 5.0
*/
@Deprecated
public class SlowCollatedTermRangeTermsEnum extends FilteredTermsEnum {
private Collator collator;
private String upperTermText;
private String lowerTermText;
private boolean includeLower;
private boolean includeUpper;
/**
* Enumerates all terms greater/equal than <code>lowerTerm</code>
* but less/equal than <code>upperTerm</code>.
*
* If an endpoint is null, it is said to be "open". Either or both
* endpoints may be open. Open endpoints may not be exclusive
* (you can't select all but the first or last term without
* explicitly specifying the term to exclude.)
*
* @param tenum
* @param lowerTermText
* The term text at the lower end of the range
* @param upperTermText
* The term text at the upper end of the range
* @param includeLower
* If true, the <code>lowerTerm</code> is included in the range.
* @param includeUpper
* If true, the <code>upperTerm</code> is included in the range.
* @param collator
* The collator to use to collate index Terms, to determine their
* membership in the range bounded by <code>lowerTerm</code> and
* <code>upperTerm</code>.
*
* @throws IOException
*/
public SlowCollatedTermRangeTermsEnum(TermsEnum tenum, String lowerTermText, String upperTermText,
boolean includeLower, boolean includeUpper, Collator collator) throws IOException {
super(tenum);
this.collator = collator;
this.upperTermText = upperTermText;
this.lowerTermText = lowerTermText;
this.includeLower = includeLower;
this.includeUpper = includeUpper;
// do a little bit of normalization...
// open ended range queries should always be inclusive.
if (this.lowerTermText == null) {
this.lowerTermText = "";
this.includeLower = true;
}
// TODO: optimize
BytesRef startBytesRef = new BytesRef("");
setInitialSeekTerm(startBytesRef);
}
@Override
protected AcceptStatus accept(BytesRef term) {
// convert the term's bytes to UTF-16 once, rather than per comparison
final String termText = term.utf8ToString();
if ((includeLower
? collator.compare(termText, lowerTermText) >= 0
: collator.compare(termText, lowerTermText) > 0)
&& (upperTermText == null
|| (includeUpper
? collator.compare(termText, upperTermText) <= 0
: collator.compare(termText, upperTermText) < 0))) {
return AcceptStatus.YES;
}
return AcceptStatus.NO;
}
}
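To see why collated acceptance differs from binary order, consider the Farsi case noted in the test removed further down: Arabic-locale collation orders U+0698 before U+0633, the reverse of their code point order. A sketch (exact ordering depends on the JDK's collation tables):

Collator c = Collator.getInstance(new Locale("ar"));
assert "\u0633".compareTo("\u0698") < 0;  // code point order: U+0633 before U+0698
assert c.compare("\u0698", "\u0633") < 0; // Arabic collation: U+0698 before U+0633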

View File

@ -70,7 +70,7 @@ public class BooleanFilterTest extends LuceneTestCase {
private Filter getRangeFilter(String field,String lowerPrice, String upperPrice)
{
Filter f = new TermRangeFilter(field,lowerPrice,upperPrice,true,true);
Filter f = TermRangeFilter.newStringRange(field,lowerPrice,upperPrice,true,true);
return f;
}
private Filter getTermsFilter(String field,String text)

View File

@ -84,7 +84,7 @@ public class ChainedFilterTest extends LuceneTestCase {
//Date pastTheEnd = parseDate("2099 Jan 1");
// dateFilter = DateFilter.Before("date", pastTheEnd);
// just treat dates as strings and select the whole range for now...
dateFilter = new TermRangeFilter("date","","ZZZZ",true,true);
dateFilter = TermRangeFilter.newStringRange("date","","ZZZZ",true,true);
bobFilter = new QueryWrapperFilter(
new TermQuery(new Term("owner", "bob")));

View File

@ -0,0 +1,137 @@
package org.apache.lucene.search;
import java.io.IOException;
import java.text.Collator;
import java.util.Locale;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Tests SlowCollatedStringComparator, SlowCollatedTermRangeQuery, and SlowCollatedTermRangeFilter
*/
public class TestSlowCollationMethods extends LuceneTestCase {
private static Collator collator;
private static IndexSearcher searcher;
private static IndexReader reader;
private static Directory dir;
private static int numDocs;
@BeforeClass
public static void beforeClass() throws Exception {
final Locale locale = LuceneTestCase.randomLocale(random);
collator = Collator.getInstance(locale);
collator.setStrength(Collator.IDENTICAL);
collator.setDecomposition(Collator.NO_DECOMPOSITION);
numDocs = 1000 * RANDOM_MULTIPLIER;
dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random, dir);
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
String value = _TestUtil.randomUnicodeString(random);
Field field = newField("field", value, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
doc.add(field);
iw.addDocument(doc);
}
reader = iw.getReader();
iw.close();
// TODO: we should be able to use newSearcher, but custom sorts are broken if IS has an executorservice
// see LUCENE-2941
//searcher = newSearcher(reader);
searcher = new IndexSearcher(reader);
}
@AfterClass
public static void afterClass() throws Exception {
searcher.close();
reader.close();
dir.close();
collator = null;
searcher = null;
reader = null;
dir = null;
}
public void testSort() throws Exception {
SortField sf = new SortField("field", new FieldComparatorSource() {
@Override
public FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException {
return new SlowCollatedStringComparator(numHits, fieldname, collator);
}
});
TopFieldDocs docs = searcher.search(new MatchAllDocsQuery(), null, numDocs, new Sort(sf));
String prev = "";
for (ScoreDoc doc : docs.scoreDocs) {
String value = reader.document(doc.doc).get("field");
assertTrue(collator.compare(value, prev) >= 0);
prev = value;
}
}
private void doTestRanges(String startPoint, String endPoint, Query query) throws Exception {
// positive test
TopDocs docs = searcher.search(query, numDocs);
for (ScoreDoc doc : docs.scoreDocs) {
String value = reader.document(doc.doc).get("field");
assertTrue(collator.compare(value, startPoint) >= 0);
assertTrue(collator.compare(value, endPoint) <= 0);
}
// negative test
BooleanQuery bq = new BooleanQuery();
bq.add(new MatchAllDocsQuery(), Occur.SHOULD);
bq.add(query, Occur.MUST_NOT);
docs = searcher.search(bq, numDocs);
for (ScoreDoc doc : docs.scoreDocs) {
String value = reader.document(doc.doc).get("field");
assertTrue(collator.compare(value, startPoint) < 0 || collator.compare(value, endPoint) > 0);
}
}
public void testRangeQuery() throws Exception {
int numQueries = 50*RANDOM_MULTIPLIER;
for (int i = 0; i < numQueries; i++) {
String startPoint = _TestUtil.randomUnicodeString(random);
String endPoint = _TestUtil.randomUnicodeString(random);
Query query = new SlowCollatedTermRangeQuery("field", startPoint, endPoint, true, true, collator);
doTestRanges(startPoint, endPoint, query);
}
}
public void testRangeFilter() throws Exception {
int numQueries = 50*RANDOM_MULTIPLIER;
for (int i = 0; i < numQueries; i++) {
String startPoint = _TestUtil.randomUnicodeString(random);
String endPoint = _TestUtil.randomUnicodeString(random);
Query query = new ConstantScoreQuery(new SlowCollatedTermRangeFilter("field", startPoint, endPoint, true, true, collator));
doTestRanges(startPoint, endPoint, query);
}
}
}

View File

@ -174,8 +174,7 @@ public class ComplexPhraseQueryParser extends QueryParser {
if (isPass2ResolvingPhrases) {
// Must use old-style RangeQuery in order to produce a BooleanQuery
// that can be turned into SpanOr clause
TermRangeQuery rangeQuery = new TermRangeQuery(field, part1, part2, startInclusive, endInclusive,
getRangeCollator());
TermRangeQuery rangeQuery = TermRangeQuery.newStringRange(field, part1, part2, startInclusive, endInclusive);
rangeQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
return rangeQuery;
}

View File

@ -17,7 +17,6 @@ package org.apache.lucene.queryParser.standard;
* limitations under the License.
*/
import java.text.Collator;
import java.util.Locale;
import java.util.Map;
import java.util.TooManyListenersException;
@ -41,10 +40,8 @@ import org.apache.lucene.queryParser.standard.config.LowercaseExpandedTermsAttri
import org.apache.lucene.queryParser.standard.config.MultiFieldAttribute;
import org.apache.lucene.queryParser.standard.config.MultiTermRewriteMethodAttribute;
import org.apache.lucene.queryParser.standard.config.PositionIncrementsAttribute;
import org.apache.lucene.queryParser.standard.config.RangeCollatorAttribute;
import org.apache.lucene.queryParser.standard.config.StandardQueryConfigHandler;
import org.apache.lucene.queryParser.standard.config.DefaultOperatorAttribute.Operator;
import org.apache.lucene.queryParser.standard.nodes.RangeQueryNode;
import org.apache.lucene.queryParser.standard.parser.StandardSyntaxParser;
import org.apache.lucene.queryParser.standard.processors.StandardQueryNodeProcessorPipeline;
import org.apache.lucene.search.FuzzyQuery;
@ -187,32 +184,6 @@ public class StandardQueryParser extends QueryParserHelper {
return attr.getOperator();
}
/**
* Sets the collator used to determine index term inclusion in ranges for
* RangeQuerys.
* <p/>
* <strong>WARNING:</strong> Setting the rangeCollator to a non-null collator
* using this method will cause every single index Term in the Field
* referenced by lowerTerm and/or upperTerm to be examined. Depending on the
* number of index Terms in this Field, the operation could be very slow.
*
* @param collator
* the collator to use when constructing {@link RangeQueryNode}s
*/
public void setRangeCollator(Collator collator) {
RangeCollatorAttribute attr = getQueryConfigHandler().getAttribute(RangeCollatorAttribute.class);
attr.setDateResolution(collator);
}
/**
* @return the collator used to determine index term inclusion in ranges for
* RangeQuerys.
*/
public Collator getRangeCollator() {
RangeCollatorAttribute attr = getQueryConfigHandler().getAttribute(RangeCollatorAttribute.class);
return attr.getRangeCollator();
}
/**
* Sets the boolean operator of the QueryParser. In default mode (
* {@link Operator#OR}) terms without any modifiers are considered optional:

View File

@ -53,9 +53,7 @@ public class RangeQueryNodeBuilder implements StandardQueryBuilder {
String field = rangeNode.getField().toString();
TermRangeQuery rangeQuery = new TermRangeQuery(field, lower
.getTextAsString(), upper.getTextAsString(), lowerInclusive,
upperInclusive, rangeNode.getCollator());
TermRangeQuery rangeQuery = TermRangeQuery.newStringRange(field, lower.getTextAsString(), upper.getTextAsString(), lowerInclusive, upperInclusive);
MultiTermQuery.RewriteMethod method = (MultiTermQuery.RewriteMethod)queryNode.getTag(MultiTermRewriteMethodAttribute.TAG_ID);
if (method != null) {

View File

@ -1,92 +0,0 @@
package org.apache.lucene.queryParser.standard.config;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.Collator;
import org.apache.lucene.queryParser.core.config.QueryConfigHandler;
import org.apache.lucene.queryParser.standard.processors.ParametricRangeQueryNodeProcessor;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.AttributeImpl;
/**
* This attribute is used by {@link ParametricRangeQueryNodeProcessor} processor
* and must be defined in the {@link QueryConfigHandler}. This attribute tells
* the processor which {@link Collator} should be used for a
* {@link TermRangeQuery} <br/>
*
* @see org.apache.lucene.queryParser.standard.config.RangeCollatorAttribute
*/
public class RangeCollatorAttributeImpl extends AttributeImpl
implements RangeCollatorAttribute {
private Collator rangeCollator;
public RangeCollatorAttributeImpl() {
rangeCollator = null; // default value for 2.4
}
public void setDateResolution(Collator rangeCollator) {
this.rangeCollator = rangeCollator;
}
public Collator getRangeCollator() {
return this.rangeCollator;
}
@Override
public void clear() {
throw new UnsupportedOperationException();
}
@Override
public void copyTo(AttributeImpl target) {
throw new UnsupportedOperationException();
}
@Override
public boolean equals(Object other) {
if (other instanceof RangeCollatorAttributeImpl) {
RangeCollatorAttributeImpl rangeCollatorAttr = (RangeCollatorAttributeImpl) other;
if (rangeCollatorAttr.rangeCollator == this.rangeCollator
|| rangeCollatorAttr.rangeCollator.equals(this.rangeCollator)) {
return true;
}
}
return false;
}
@Override
public int hashCode() {
return (this.rangeCollator == null) ? 0 : this.rangeCollator.hashCode();
}
@Override
public String toString() {
return "<rangeCollatorAttribute rangeCollator='" + this.rangeCollator
+ "'/>";
}
}

View File

@ -38,7 +38,6 @@ public class StandardQueryConfigHandler extends QueryConfigHandler {
addFieldConfigListener(new FieldDateResolutionFCListener(this));
// Default Values
addAttribute(RangeCollatorAttribute.class);
addAttribute(DefaultOperatorAttribute.class);
addAttribute(AnalyzerAttribute.class);
addAttribute(FuzzyAttribute.class);

View File

@ -17,34 +17,24 @@ package org.apache.lucene.queryParser.standard.nodes;
* limitations under the License.
*/
import java.text.Collator;
import org.apache.lucene.queryParser.core.nodes.ParametricQueryNode;
import org.apache.lucene.queryParser.core.nodes.ParametricRangeQueryNode;
import org.apache.lucene.queryParser.standard.config.RangeCollatorAttribute;
import org.apache.lucene.queryParser.standard.processors.ParametricRangeQueryNodeProcessor;
/**
* This query node represents a range query. It also holds which collator will
* be used by the range query and if the constant score rewrite is enabled. <br/>
* This query node represents a range query.
*
* @see ParametricRangeQueryNodeProcessor
* @see RangeCollatorAttribute
* @see org.apache.lucene.search.TermRangeQuery
*/
public class RangeQueryNode extends ParametricRangeQueryNode {
private Collator collator;
/**
* @param lower
* @param upper
*/
public RangeQueryNode(ParametricQueryNode lower, ParametricQueryNode upper, Collator collator) {
public RangeQueryNode(ParametricQueryNode lower, ParametricQueryNode upper) {
super(lower, upper);
this.collator = collator;
}
@Override
@ -57,12 +47,4 @@ public class RangeQueryNode extends ParametricRangeQueryNode {
return sb.toString();
}
/**
* @return the collator
*/
public Collator getCollator() {
return this.collator;
}
}

View File

@ -17,7 +17,6 @@ package org.apache.lucene.queryParser.standard.processors;
* limitations under the License.
*/
import java.text.Collator;
import java.text.DateFormat;
import java.util.Calendar;
import java.util.Date;
@ -36,7 +35,6 @@ import org.apache.lucene.queryParser.core.nodes.ParametricQueryNode.CompareOpera
import org.apache.lucene.queryParser.core.processors.QueryNodeProcessorImpl;
import org.apache.lucene.queryParser.standard.config.DateResolutionAttribute;
import org.apache.lucene.queryParser.standard.config.LocaleAttribute;
import org.apache.lucene.queryParser.standard.config.RangeCollatorAttribute;
import org.apache.lucene.queryParser.standard.nodes.RangeQueryNode;
/**
@ -54,12 +52,7 @@ import org.apache.lucene.queryParser.standard.nodes.RangeQueryNode;
* If a {@link DateResolutionAttribute} is defined and the {@link Resolution} is
* not <code>null</code> it will also be used to parse the date value. <br/>
* <br/>
* This processor will also try to retrieve a {@link RangeCollatorAttribute}
* from the {@link QueryConfigHandler}. If a {@link RangeCollatorAttribute} is
* found and the {@link Collator} is not <code>null</code>, it's set on the
* {@link RangeQueryNode}. <br/>
*
* @see RangeCollatorAttribute
* @see DateResolutionAttribute
* @see LocaleAttribute
* @see RangeQueryNode
@ -79,17 +72,9 @@ public class ParametricRangeQueryNodeProcessor extends QueryNodeProcessorImpl {
ParametricQueryNode upper = parametricRangeNode.getUpperBound();
ParametricQueryNode lower = parametricRangeNode.getLowerBound();
Locale locale = Locale.getDefault();
Collator collator = null;
DateTools.Resolution dateRes = null;
boolean inclusive = false;
if (getQueryConfigHandler().hasAttribute(RangeCollatorAttribute.class)) {
collator = getQueryConfigHandler().getAttribute(
RangeCollatorAttribute.class).getRangeCollator();
}
if (getQueryConfigHandler().hasAttribute(LocaleAttribute.class)) {
locale = getQueryConfigHandler().getAttribute(LocaleAttribute.class)
@ -155,7 +140,7 @@ public class ParametricRangeQueryNodeProcessor extends QueryNodeProcessorImpl {
lower.setText(part1);
upper.setText(part2);
return new RangeQueryNode(lower, upper, collator);
return new RangeQueryNode(lower, upper);
}

View File

@ -642,55 +642,6 @@ public class TestQPHelper extends LuceneTestCase {
"gack (bar blar {a TO z})");
}
public void testFarsiRangeCollating() throws Exception {
Directory ramDir = newDirectory();
IndexWriter iw = new IndexWriter(ramDir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false)));
Document doc = new Document();
doc.add(newField("content", "\u0633\u0627\u0628", Field.Store.YES,
Field.Index.NOT_ANALYZED));
iw.addDocument(doc);
iw.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(new MockAnalyzer(MockTokenizer.WHITESPACE, false));
// Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in
// RuleBasedCollator. However, the Arabic Locale seems to order the
// Farsi
// characters properly.
Collator c = Collator.getInstance(new Locale("ar"));
qp.setRangeCollator(c);
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the
// single
// index Term below should NOT be returned by a ConstantScoreRangeQuery
// with a Farsi Collator (or an Arabic one for the case when Farsi is
// not
// supported).
// Test ConstantScoreRangeQuery
qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
ScoreDoc[] result = is.search(qp.parse("[ \u062F TO \u0698 ]", "content"),
null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
result = is.search(qp.parse("[ \u0633 TO \u0638 ]", "content"), null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
// Test RangeQuery
qp.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
result = is.search(qp.parse("[ \u062F TO \u0698 ]", "content"), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
result = is.search(qp.parse("[ \u0633 TO \u0638 ]", "content"), null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
is.close();
ramDir.close();
}
/** for testing DateTools support */
private String getDate(String s, DateTools.Resolution resolution)
throws Exception {

View File

@ -60,8 +60,6 @@ public class TestAttributes extends LuceneTestCase {
Collections.singletonMap(MultiTermRewriteMethodAttribute.class.getName()+"#multiTermRewriteMethod", MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT));
_TestUtil.assertAttributeReflection(new PositionIncrementsAttributeImpl(),
Collections.singletonMap(PositionIncrementsAttribute.class.getName()+"#positionIncrementsEnabled", false));
_TestUtil.assertAttributeReflection(new RangeCollatorAttributeImpl(),
Collections.singletonMap(RangeCollatorAttribute.class.getName()+"#rangeCollator", null));
}
}

View File

@ -41,7 +41,7 @@ public class RangeFilterBuilder implements FilterBuilder {
String upperTerm=e.getAttribute("upperTerm");
boolean includeLower=DOMUtils.getAttribute(e,"includeLower",true);
boolean includeUpper=DOMUtils.getAttribute(e,"includeUpper",true);
return new TermRangeFilter(fieldName,lowerTerm,upperTerm,includeLower,includeUpper);
return TermRangeFilter.newStringRange(fieldName,lowerTerm,upperTerm,includeLower,includeUpper);
}
}

View File

@ -77,7 +77,7 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr
}
// *** TermToBytesRefAttribute interface ***
public final int toBytesRef(BytesRef target) {
public int toBytesRef(BytesRef target) {
return UnicodeUtil.UTF16toUTF8WithHash(termBuffer, 0, termLength, target);
}

View File

@ -78,9 +78,9 @@ public abstract class QueryParserBase {
// maps field names to date resolutions
Map<String,DateTools.Resolution> fieldToDateResolution = null;
// The collator to use when determining range inclusion,
// for use when constructing RangeQuerys.
Collator rangeCollator = null;
// Whether or not to analyze range terms when constructing RangeQuerys
// (For example, analyzing terms into collation keys for locale-sensitive RangeQuery)
boolean analyzeRangeTerms = false;
boolean autoGeneratePhraseQueries;
@ -391,27 +391,21 @@ public abstract class QueryParserBase {
}
/**
* Sets the collator used to determine index term inclusion in ranges
* for RangeQuerys.
* <p/>
* <strong>WARNING:</strong> Setting the rangeCollator to a non-null
* collator using this method will cause every single index Term in the
* Field referenced by lowerTerm and/or upperTerm to be examined.
* Depending on the number of index Terms in this Field, the operation could
* be very slow.
*
* @param rc the collator to use when constructing RangeQuerys
* Set whether or not to analyze range terms when constructing RangeQuerys.
* For example, setting this to true can enable analyzing terms into
* collation keys for locale-sensitive RangeQuery.
*
* @param analyzeRangeTerms whether or not terms should be analyzed for RangeQuerys
*/
public void setRangeCollator(Collator rc) {
rangeCollator = rc;
public void setAnalyzeRangeTerms(boolean analyzeRangeTerms) {
this.analyzeRangeTerms = analyzeRangeTerms;
}
/**
* @return the collator used to determine index term inclusion in ranges
* for RangeQuerys.
* @return whether or not to analyze range terms when constructing RangeQuerys.
*/
public Collator getRangeCollator() {
return rangeCollator;
public boolean getAnalyzeRangeTerms() {
return analyzeRangeTerms;
}
protected void addClause(List<BooleanClause> clauses, int conj, int mods, Query q) {
@ -792,6 +786,36 @@ public abstract class QueryParserBase {
return new FuzzyQuery(term,minimumSimilarity,prefixLength);
}
private BytesRef analyzeRangePart(String field, String part) {
TokenStream source;
try {
source = analyzer.reusableTokenStream(field, new StringReader(part));
source.reset();
} catch (IOException e) {
source = analyzer.tokenStream(field, new StringReader(part));
}
BytesRef result = new BytesRef();
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
try {
if (!source.incrementToken())
throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
termAtt.toBytesRef(result);
if (source.incrementToken())
throw new IllegalArgumentException("analyzer returned too many terms for range part: " + part);
} catch (IOException e) {
throw new RuntimeException("error analyzing range part: " + part, e);
}
try {
source.close();
} catch (IOException ignored) {}
return result;
}
/**
* Builds a new TermRangeQuery instance
* @param field Field
@ -802,7 +826,23 @@ public abstract class QueryParserBase {
* @return new TermRangeQuery instance
*/
protected Query newRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) {
final TermRangeQuery query = new TermRangeQuery(field, part1, part2, startInclusive, endInclusive, rangeCollator);
final BytesRef start;
final BytesRef end;
if (part1 == null) {
start = null;
} else {
start = analyzeRangeTerms ? analyzeRangePart(field, part1) : new BytesRef(part1);
}
if (part2 == null) {
end = null;
} else {
end = analyzeRangeTerms ? analyzeRangePart(field, part2) : new BytesRef(part2);
}
final TermRangeQuery query = new TermRangeQuery(field, start, end, startInclusive, endInclusive);
query.setRewriteMethod(multiTermRewriteMethod);
return query;
}
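A sketch of the new endpoint handling (`qp` is a hypothetical QueryParser over field "content"): with analyzeRangeTerms off, endpoints become raw UTF-8 bytes; with it on, each endpoint is passed through analyzeRangePart and must produce exactly one token:

static void rangeEndpoints(QueryParser qp) throws ParseException {
  qp.setAnalyzeRangeTerms(false);
  Query raw = qp.parse("content:[abc TO abd]");      // endpoints kept as raw UTF-8 bytes
  qp.setAnalyzeRangeTerms(true);
  Query analyzed = qp.parse("content:[abc TO abd]"); // endpoints analyzed; zero or multiple
                                                     // tokens raise IllegalArgumentException
}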

View File

@ -18,8 +18,6 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.text.Collator;
import java.util.Locale;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.FieldCache.DocTermsIndex;
@ -718,85 +716,6 @@ public abstract class FieldComparator {
}
}
/** Sorts by a field's value using the Collator for a
* given Locale.
*
* <p><b>WARNING</b>: this is likely very slow; you'll
* get much better performance using the
* CollationKeyAnalyzer or ICUCollationKeyAnalyzer. */
public static final class StringComparatorLocale extends FieldComparator {
private final String[] values;
private DocTerms currentDocTerms;
private final String field;
final Collator collator;
private String bottom;
private final BytesRef tempBR = new BytesRef();
StringComparatorLocale(int numHits, String field, Locale locale) {
values = new String[numHits];
this.field = field;
collator = Collator.getInstance(locale);
}
@Override
public int compare(int slot1, int slot2) {
final String val1 = values[slot1];
final String val2 = values[slot2];
if (val1 == null) {
if (val2 == null) {
return 0;
}
return -1;
} else if (val2 == null) {
return 1;
}
return collator.compare(val1, val2);
}
@Override
public int compareBottom(int doc) {
final String val2 = currentDocTerms.getTerm(doc, tempBR).utf8ToString();
if (bottom == null) {
if (val2 == null) {
return 0;
}
return -1;
} else if (val2 == null) {
return 1;
}
return collator.compare(bottom, val2);
}
@Override
public void copy(int slot, int doc) {
final BytesRef br = currentDocTerms.getTerm(doc, tempBR);
if (br == null) {
values[slot] = null;
} else {
values[slot] = br.utf8ToString();
}
}
@Override
public FieldComparator setNextReader(AtomicReaderContext context) throws IOException {
currentDocTerms = FieldCache.DEFAULT.getTerms(context.reader, field);
return this;
}
@Override
public void setBottom(final int bottom) {
this.bottom = values[bottom];
}
@Override
public Comparable<?> value(int slot) {
final String s = values[slot];
return s == null ? null : new BytesRef(values[slot]);
}
}
/** Sorts by field's natural Term sort order, using
* ordinals. This is functionally equivalent to {@link
* TermValComparator}, but it first resolves the string

View File

@ -20,9 +20,6 @@ package org.apache.lucene.search;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
/**
* Expert: Collects sorted results from Searchables and collates them.
* The elements put into this queue must be of type FieldDoc.
@ -35,11 +32,6 @@ class FieldDocSortedHitQueue extends PriorityQueue<FieldDoc> {
volatile SortField[] fields = null;
// used in the case where the fields are sorted by locale
// based strings
volatile Collator[] collators = null;
/**
* Creates a hit queue sorted by the given list of fields.
* @param fields Fieldable names, in priority order (highest priority first).
@ -60,7 +52,6 @@ class FieldDocSortedHitQueue extends PriorityQueue<FieldDoc> {
*/
void setFields (SortField[] fields) {
this.fields = fields;
this.collators = hasCollators (fields);
}
@ -69,24 +60,6 @@ class FieldDocSortedHitQueue extends PriorityQueue<FieldDoc> {
return fields;
}
/** Returns an array of collators, possibly <code>null</code>. The collators
* correspond to any SortFields which were given a specific locale.
* @param fields Array of sort fields.
* @return Array, possibly <code>null</code>.
*/
private Collator[] hasCollators (final SortField[] fields) {
if (fields == null) return null;
Collator[] ret = new Collator[fields.length];
for (int i=0; i<fields.length; ++i) {
Locale locale = fields[i].getLocale();
if (locale != null)
ret[i] = Collator.getInstance (locale);
}
return ret;
}
/**
* Returns whether <code>a</code> is less relevant than <code>b</code>.
* @param a ScoreDoc
@ -109,11 +82,9 @@ class FieldDocSortedHitQueue extends PriorityQueue<FieldDoc> {
c = (s2 == null) ? 0 : -1;
} else if (s2 == null) {
c = 1;
} else if (fields[i].getLocale() == null) {
c = s1.compareTo(s2);
} else {
c = collators[i].compare(s1.utf8ToString(), s2.utf8ToString());
}
c = s1.compareTo(s2);
}
} else {
c = docA.fields[i].compareTo(docB.fields[i]);
if (type == SortField.SCORE) {

View File

@ -18,7 +18,6 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.Locale;
import org.apache.lucene.search.cache.*;
import org.apache.lucene.util.StringHelper;
@ -90,7 +89,6 @@ public class SortField {
private String field;
private int type; // defaults to determining type dynamically
private Locale locale; // defaults to "natural order" (no Locale)
boolean reverse = false; // defaults to natural order
private CachedArrayCreator<?> creator;
public Object missingValue = null; // used for 'sortMissingFirst/Last'
@ -213,28 +211,6 @@ public class SortField {
}
return this;
}
/** Creates a sort by terms in the given field sorted
* according to the given locale.
* @param field Name of field to sort by, cannot be <code>null</code>.
* @param locale Locale of values in the field.
*/
public SortField (String field, Locale locale) {
initFieldType(field, STRING);
this.locale = locale;
}
/** Creates a sort, possibly in reverse, by terms in the given field sorted
* according to the given locale.
* @param field Name of field to sort by, cannot be <code>null</code>.
* @param locale Locale of values in the field.
*/
public SortField (String field, Locale locale, boolean reverse) {
initFieldType(field, STRING);
this.locale = locale;
this.reverse = reverse;
}
/** Creates a sort with a custom comparison function.
* @param field Name of field to sort by; cannot be <code>null</code>.
@ -295,14 +271,6 @@ public class SortField {
return type;
}
/** Returns the Locale by which term values are interpreted.
* May return <code>null</code> if no Locale was specified.
* @return Locale, or <code>null</code>.
*/
public Locale getLocale() {
return locale;
}
/** Returns the instance of a {@link FieldCache} parser that fits to the given sort type.
* May return <code>null</code> if no parser was specified. Sorting is using the default parser then.
* @return An instance of a {@link FieldCache} parser, or <code>null</code>.
@ -384,7 +352,6 @@ public class SortField {
break;
}
if (locale != null) buffer.append('(').append(locale).append(')');
if (creator != null) buffer.append('(').append(creator).append(')');
if (reverse) buffer.append('!');
@ -404,7 +371,6 @@ public class SortField {
other.field == this.field // field is always interned
&& other.type == this.type
&& other.reverse == this.reverse
&& (other.locale == null ? this.locale == null : other.locale.equals(this.locale))
&& (other.comparatorSource == null ? this.comparatorSource == null : other.comparatorSource.equals(this.comparatorSource))
&& (other.creator == null ? this.creator == null : other.creator.equals(this.creator))
);
@ -419,7 +385,6 @@ public class SortField {
public int hashCode() {
int hash=type^0x346565dd + Boolean.valueOf(reverse).hashCode()^0xaf5998bb;
if (field != null) hash += field.hashCode()^0xff5685dd;
if (locale != null) hash += locale.hashCode()^0x08150815;
if (comparatorSource != null) hash += comparatorSource.hashCode();
if (creator != null) hash += creator.hashCode()^0x3aaf56ff;
return hash;
@ -439,13 +404,6 @@ public class SortField {
*/
public FieldComparator getComparator(final int numHits, final int sortPos) throws IOException {
if (locale != null) {
// TODO: it'd be nice to allow FieldCache.getStringIndex
// to optionally accept a Locale so sorting could then use
// the faster StringComparator impls
return new FieldComparator.StringComparatorLocale(numHits, field, locale);
}
switch (type) {
case SortField.SCORE:
return new FieldComparator.RelevanceComparator(numHits);

View File

@ -1,5 +1,7 @@
package org.apache.lucene.search;
import org.apache.lucene.util.BytesRef;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -17,15 +19,13 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.text.Collator;
/**
* A Filter that restricts search results to a range of term
* values in a given field.
*
* <p>This filter matches documents containing terms that fall into the
* supplied range according to {@link
* String#compareTo(String)}, unless a <code>Collator</code> is provided. It is not intended
* BytesRef#compareTo(BytesRef)}. It is not intended
* for numerical ranges; use {@link NumericRangeFilter} instead.
*
* <p>If you construct a large number of range filters with different ranges but on the
@ -44,39 +44,25 @@ public class TermRangeFilter extends MultiTermQueryWrapperFilter<TermRangeQuery>
* lowerTerm is null and includeLower is true (similar for upperTerm
* and includeUpper)
*/
public TermRangeFilter(String fieldName, String lowerTerm, String upperTerm,
public TermRangeFilter(String fieldName, BytesRef lowerTerm, BytesRef upperTerm,
boolean includeLower, boolean includeUpper) {
super(new TermRangeQuery(fieldName, lowerTerm, upperTerm, includeLower, includeUpper));
}
/**
* <strong>WARNING:</strong> Using this constructor and supplying a non-null
* value in the <code>collator</code> parameter will cause every single
* index Term in the Field referenced by lowerTerm and/or upperTerm to be
* examined. Depending on the number of index Terms in this Field, the
* operation could be very slow.
*
* @param lowerTerm The lower bound on this range
* @param upperTerm The upper bound on this range
* @param includeLower Does this range include the lower bound?
* @param includeUpper Does this range include the upper bound?
* @param collator The collator to use when determining range inclusion; set
* to null to use Unicode code point ordering instead of collation.
* @throws IllegalArgumentException if both terms are null or if
* lowerTerm is null and includeLower is true (similar for upperTerm
* and includeUpper)
* Factory that creates a new TermRangeFilter using Strings for term text.
*/
public TermRangeFilter(String fieldName, String lowerTerm, String upperTerm,
boolean includeLower, boolean includeUpper,
Collator collator) {
super(new TermRangeQuery(fieldName, lowerTerm, upperTerm, includeLower, includeUpper, collator));
public static TermRangeFilter newStringRange(String field, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper) {
BytesRef lower = lowerTerm == null ? null : new BytesRef(lowerTerm);
BytesRef upper = upperTerm == null ? null : new BytesRef(upperTerm);
return new TermRangeFilter(field, lower, upper, includeLower, includeUpper);
}
/**
* Constructs a filter for field <code>fieldName</code> matching
* less than or equal to <code>upperTerm</code>.
*/
public static TermRangeFilter Less(String fieldName, String upperTerm) {
public static TermRangeFilter Less(String fieldName, BytesRef upperTerm) {
return new TermRangeFilter(fieldName, null, upperTerm, false, true);
}
@ -84,22 +70,19 @@ public class TermRangeFilter extends MultiTermQueryWrapperFilter<TermRangeQuery>
* Constructs a filter for field <code>fieldName</code> matching
* greater than or equal to <code>lowerTerm</code>.
*/
public static TermRangeFilter More(String fieldName, String lowerTerm) {
public static TermRangeFilter More(String fieldName, BytesRef lowerTerm) {
return new TermRangeFilter(fieldName, lowerTerm, null, true, false);
}
/** Returns the lower value of this range filter */
public String getLowerTerm() { return query.getLowerTerm(); }
public BytesRef getLowerTerm() { return query.getLowerTerm(); }
/** Returns the upper value of this range filter */
public String getUpperTerm() { return query.getUpperTerm(); }
public BytesRef getUpperTerm() { return query.getUpperTerm(); }
/** Returns <code>true</code> if the lower endpoint is inclusive */
public boolean includesLower() { return query.includesLower(); }
/** Returns <code>true</code> if the upper endpoint is inclusive */
public boolean includesUpper() { return query.includesUpper(); }
/** Returns the collator used to determine range inclusion, if any. */
public Collator getCollator() { return query.getCollator(); }
}
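Usage sketch for the byte-based filter API (values hypothetical); the String convenience lives in newStringRange, while the constructor and the Less/More helpers take BytesRef:

TermRangeFilter a = TermRangeFilter.newStringRange("field", "bar", "foo", true, true);
TermRangeFilter b = new TermRangeFilter("field", new BytesRef("bar"), new BytesRef("foo"), true, true);
TermRangeFilter upTo = TermRangeFilter.Less("field", new BytesRef("foo")); // <= foo
TermRangeFilter from = TermRangeFilter.More("field", new BytesRef("bar")); // >= bar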

View File

@ -18,11 +18,11 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.text.Collator;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ToStringUtils;
/**
@ -30,7 +30,7 @@ import org.apache.lucene.util.ToStringUtils;
*
* <p>This query matches documents containing terms that fall into the
* supplied range according to {@link
* String#compareTo(String)}, unless a <code>Collator</code> is provided. It is not intended
* BytesRef#compareTo(BytesRef)}. It is not intended
* for numerical ranges; use {@link NumericRangeQuery} instead.
*
* <p>This query uses the {@link
@ -40,9 +40,8 @@ import org.apache.lucene.util.ToStringUtils;
*/
public class TermRangeQuery extends MultiTermQuery {
private String lowerTerm;
private String upperTerm;
private Collator collator;
private BytesRef lowerTerm;
private BytesRef upperTerm;
private boolean includeLower;
private boolean includeUpper;
@ -69,78 +68,48 @@ public class TermRangeQuery extends MultiTermQuery {
* If true, the <code>upperTerm</code> is
* included in the range.
*/
public TermRangeQuery(String field, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper) {
this(field, lowerTerm, upperTerm, includeLower, includeUpper, null);
}
/** Constructs a query selecting all terms greater/equal than
* <code>lowerTerm</code> but less/equal than <code>upperTerm</code>.
* <p>
* If an endpoint is null, it is said
* to be "open". Either or both endpoints may be open. Open endpoints may not
* be exclusive (you can't select all but the first or last term without
* explicitly specifying the term to exclude.)
* <p>
* If <code>collator</code> is not null, it will be used to decide whether
* index terms are within the given range, rather than using the Unicode code
* point order in which index terms are stored.
* <p>
* <strong>WARNING:</strong> Using this constructor and supplying a non-null
* value in the <code>collator</code> parameter will cause every single
* index Term in the Field referenced by lowerTerm and/or upperTerm to be
* examined. Depending on the number of index Terms in this Field, the
* operation could be very slow.
*
* @param lowerTerm The Term text at the lower end of the range
* @param upperTerm The Term text at the upper end of the range
* @param includeLower
* If true, the <code>lowerTerm</code> is
* included in the range.
* @param includeUpper
* If true, the <code>upperTerm</code> is
* included in the range.
* @param collator The collator to use to collate index Terms, to determine
* their membership in the range bounded by <code>lowerTerm</code> and
* <code>upperTerm</code>.
*/
public TermRangeQuery(String field, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper,
Collator collator) {
public TermRangeQuery(String field, BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
super(field);
this.lowerTerm = lowerTerm;
this.upperTerm = upperTerm;
this.includeLower = includeLower;
this.includeUpper = includeUpper;
this.collator = collator;
}
/**
* Factory that creates a new TermRangeQuery using Strings for term text.
*/
public static TermRangeQuery newStringRange(String field, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper) {
BytesRef lower = lowerTerm == null ? null : new BytesRef(lowerTerm);
BytesRef upper = upperTerm == null ? null : new BytesRef(upperTerm);
return new TermRangeQuery(field, lower, upper, includeLower, includeUpper);
}
/** Returns the lower value of this range query */
public String getLowerTerm() { return lowerTerm; }
public BytesRef getLowerTerm() { return lowerTerm; }
/** Returns the upper value of this range query */
public String getUpperTerm() { return upperTerm; }
public BytesRef getUpperTerm() { return upperTerm; }
/** Returns <code>true</code> if the lower endpoint is inclusive */
public boolean includesLower() { return includeLower; }
/** Returns <code>true</code> if the upper endpoint is inclusive */
public boolean includesUpper() { return includeUpper; }
/** Returns the collator used to determine range inclusion, if any. */
public Collator getCollator() { return collator; }
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
if (collator == null && lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) {
if (lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) {
return TermsEnum.EMPTY;
}
TermsEnum tenum = terms.iterator();
if ((lowerTerm == null || (collator == null && includeLower && "".equals(lowerTerm))) && upperTerm == null) {
if ((lowerTerm == null || (includeLower && lowerTerm.length == 0)) && upperTerm == null) {
return tenum;
}
return new TermRangeTermsEnum(tenum,
lowerTerm, upperTerm, includeLower, includeUpper, collator);
lowerTerm, upperTerm, includeLower, includeUpper);
}
/** Prints a user-readable version of this query. */
@ -152,9 +121,10 @@ public class TermRangeQuery extends MultiTermQuery {
buffer.append(":");
}
buffer.append(includeLower ? '[' : '{');
buffer.append(lowerTerm != null ? ("*".equals(lowerTerm) ? "\\*" : lowerTerm) : "*");
// TODO: all these toStrings for queries should just output the bytes, it might not be UTF-8!
buffer.append(lowerTerm != null ? ("*".equals(lowerTerm.utf8ToString()) ? "\\*" : lowerTerm.utf8ToString()) : "*");
buffer.append(" TO ");
buffer.append(upperTerm != null ? ("*".equals(upperTerm) ? "\\*" : upperTerm) : "*");
buffer.append(upperTerm != null ? ("*".equals(upperTerm.utf8ToString()) ? "\\*" : upperTerm.utf8ToString()) : "*");
buffer.append(includeUpper ? ']' : '}');
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
@ -164,7 +134,6 @@ public class TermRangeQuery extends MultiTermQuery {
public int hashCode() {
final int prime = 31;
int result = super.hashCode();
result = prime * result + ((collator == null) ? 0 : collator.hashCode());
result = prime * result + (includeLower ? 1231 : 1237);
result = prime * result + (includeUpper ? 1231 : 1237);
result = prime * result + ((lowerTerm == null) ? 0 : lowerTerm.hashCode());
@ -181,11 +150,6 @@ public class TermRangeQuery extends MultiTermQuery {
if (getClass() != obj.getClass())
return false;
TermRangeQuery other = (TermRangeQuery) obj;
if (collator == null) {
if (other.collator != null)
return false;
} else if (!collator.equals(other.collator))
return false;
if (includeLower != other.includeLower)
return false;
if (includeUpper != other.includeUpper)

View File

@ -18,7 +18,6 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.text.Collator;
import java.util.Comparator;
import org.apache.lucene.index.TermsEnum;
@ -33,11 +32,8 @@ import org.apache.lucene.util.BytesRef;
*/
public class TermRangeTermsEnum extends FilteredTermsEnum {
private Collator collator;
private String upperTermText;
private String lowerTermText;
private boolean includeLower;
private boolean includeUpper;
final private boolean includeLower;
final private boolean includeUpper;
final private BytesRef lowerBytesRef;
final private BytesRef upperBytesRef;
private final Comparator<BytesRef> termComp;
@ -53,79 +49,61 @@ public class TermRangeTermsEnum extends FilteredTermsEnum {
*
* @param tenum
* TermsEnum to filter
* @param lowerTermText
* @param lowerTerm
* The term text at the lower end of the range
* @param upperTermText
* @param upperTerm
* The term text at the upper end of the range
* @param includeLower
* If true, the <code>lowerTerm</code> is included in the range.
* @param includeUpper
* If true, the <code>upperTerm</code> is included in the range.
* @param collator
* The collator to use to collate index Terms, to determine their
* membership in the range bounded by <code>lowerTerm</code> and
* <code>upperTerm</code>.
*
* @throws IOException
*/
public TermRangeTermsEnum(TermsEnum tenum, String lowerTermText, String upperTermText,
boolean includeLower, boolean includeUpper, Collator collator) throws IOException {
public TermRangeTermsEnum(TermsEnum tenum, BytesRef lowerTerm, BytesRef upperTerm,
boolean includeLower, boolean includeUpper) throws IOException {
super(tenum);
this.collator = collator;
this.upperTermText = upperTermText;
this.lowerTermText = lowerTermText;
this.includeLower = includeLower;
this.includeUpper = includeUpper;
// do a little bit of normalization...
// open ended range queries should always be inclusive.
if (this.lowerTermText == null) {
this.lowerTermText = "";
if (lowerTerm == null) {
this.lowerBytesRef = new BytesRef();
this.includeLower = true;
} else {
this.lowerBytesRef = lowerTerm;
this.includeLower = includeLower;
}
lowerBytesRef = new BytesRef(this.lowerTermText);
if (this.upperTermText == null) {
if (upperTerm == null) {
this.includeUpper = true;
upperBytesRef = null;
} else {
upperBytesRef = new BytesRef(upperTermText);
this.includeUpper = includeUpper;
upperBytesRef = upperTerm;
}
BytesRef startBytesRef = (collator == null) ? lowerBytesRef : new BytesRef("");
setInitialSeekTerm(startBytesRef);
setInitialSeekTerm(lowerBytesRef);
termComp = getComparator();
}
@Override
protected AcceptStatus accept(BytesRef term) {
if (collator == null) {
if (!this.includeLower && term.equals(lowerBytesRef))
return AcceptStatus.NO;
// Use this field's default sort ordering
if (upperBytesRef != null) {
final int cmp = termComp.compare(upperBytesRef, term);
/*
* if beyond the upper term, or is exclusive and this is equal to
* the upper term, break out
*/
if ((cmp < 0) ||
(!includeUpper && cmp==0)) {
return AcceptStatus.END;
}
}
return AcceptStatus.YES;
} else {
if ((includeLower
? collator.compare(term.utf8ToString(), lowerTermText) >= 0
: collator.compare(term.utf8ToString(), lowerTermText) > 0)
&& (upperTermText == null
|| (includeUpper
? collator.compare(term.utf8ToString(), upperTermText) <= 0
: collator.compare(term.utf8ToString(), upperTermText) < 0))) {
return AcceptStatus.YES;
}
if (!this.includeLower && term.equals(lowerBytesRef))
return AcceptStatus.NO;
// Use this field's default sort ordering
if (upperBytesRef != null) {
final int cmp = termComp.compare(upperBytesRef, term);
/*
* if beyond the upper term, or is exclusive and this is equal to
* the upper term, break out
*/
if ((cmp < 0) ||
(!includeUpper && cmp==0)) {
return AcceptStatus.END;
}
}
return AcceptStatus.YES;
}
}
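
The boundary handling above can be restated as a standalone predicate; this is an illustrative summary, not code from the patch (the real enum additionally relies on the initial seek to the lower bound, and returns END rather than NO past the upper bound because terms arrive in sorted order):

    static boolean inRange(BytesRef term, BytesRef lower, BytesRef upper,
                           boolean includeLower, boolean includeUpper,
                           Comparator<BytesRef> cmp) {
      int lo = cmp.compare(term, lower);
      if (lo < 0 || (lo == 0 && !includeLower)) return false;  // below, or at an excluded lower bound
      if (upper != null) {
        int hi = cmp.compare(term, upper);
        if (hi > 0 || (hi == 0 && !includeUpper)) return false;  // above, or at an excluded upper bound
      }
      return true;
    }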

View File

@ -39,7 +39,10 @@ import java.nio.ByteBuffer;
* <p/>
*
* @lucene.experimental
* @deprecated Implement {@link TermToBytesRefAttribute} and store bytes directly
* instead. This class will be removed in Lucene 5.0
*/
@Deprecated
public final class IndexableBinaryStringTools {
private static final CodingCase[] CODING_CASES = {
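
As a hedged sketch of the replacement the deprecation note points to — indexing collation key bytes directly instead of binary-encoded strings — the JDK Collator already exposes the raw key; locale and term below are illustrative:

    Collator collator = Collator.getInstance(new Locale("sv", "SE"));
    byte[] key = collator.getCollationKey("Hå").toByteArray();
    BytesRef termBytes = new BytesRef(key);  // these bytes become the indexed term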

View File

@ -577,50 +577,6 @@ public class TestQueryParser extends LuceneTestCase {
assertQueryEquals("[\\* TO \"*\"]",null,"[\\* TO \\*]");
}
public void testFarsiRangeCollating() throws Exception {
Directory ramDir = newDirectory();
IndexWriter iw = new IndexWriter(ramDir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false)));
Document doc = new Document();
doc.add(newField("content","\u0633\u0627\u0628",
Field.Store.YES, Field.Index.NOT_ANALYZED));
iw.addDocument(doc);
iw.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "content", new MockAnalyzer(MockTokenizer.WHITESPACE, false));
// Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in
// RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
// characters properly.
Collator c = Collator.getInstance(new Locale("ar"));
qp.setRangeCollator(c);
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// index Term below should NOT be returned by a ConstantScoreRangeQuery
// with a Farsi Collator (or an Arabic one for the case when Farsi is not
// supported).
// Test ConstantScoreRangeQuery
qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
ScoreDoc[] result = is.search(qp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
result = is.search(qp.parse("[ \u0633 TO \u0638 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
// Test TermRangeQuery
qp.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
result = is.search(qp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
result = is.search(qp.parse("[ \u0633 TO \u0638 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
is.close();
ramDir.close();
}
private String escapeDateString(String s) {
if (s.indexOf(" ") > -1) {
return "\"" + s + "\"";
@ -1260,4 +1216,41 @@ public class TestQueryParser extends LuceneTestCase {
Query unexpanded = new TermQuery(new Term("field", "dogs"));
assertEquals(unexpanded, smart.parse("\"dogs\""));
}
/**
* Mock collation analyzer: indexes terms as "collated" + term
*/
private class MockCollationFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
protected MockCollationFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String term = termAtt.toString();
termAtt.setEmpty().append("collated").append(term);
return true;
} else {
return false;
}
}
}
private class MockCollationAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new MockCollationFilter(new MockTokenizer(reader, MockTokenizer.WHITESPACE, true));
}
}
public void testCollatedRange() throws Exception {
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCollationAnalyzer());
qp.setAnalyzeRangeTerms(true);
Query expected = TermRangeQuery.newStringRange("field", "collatedabc", "collateddef", true, true);
Query actual = qp.parse("[abc TO def]");
assertEquals(expected, actual);
}
}
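
Outside the mock, the same setup would typically pair the parser with a real collating analyzer; a sketch assuming the analysis module's CollationKeyAnalyzer is available with a (Collator) constructor:

    Collator collator = Collator.getInstance(Locale.ENGLISH);
    QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "field",
                                         new CollationKeyAnalyzer(collator));
    parser.setAnalyzeRangeTerms(true);  // run range endpoints through the analyzer too
    Query q = parser.parse("[abc TO def]");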

View File

@ -35,7 +35,7 @@ public class TestConstantScoreQuery extends LuceneTestCase {
public void testCSQ() throws Exception {
final Query q1 = new ConstantScoreQuery(new TermQuery(new Term("a", "b")));
final Query q2 = new ConstantScoreQuery(new TermQuery(new Term("a", "c")));
final Query q3 = new ConstantScoreQuery(new TermRangeFilter("a", "b", "c", true, true));
final Query q3 = new ConstantScoreQuery(TermRangeFilter.newStringRange("a", "b", "c", true, true));
QueryUtils.check(q1);
QueryUtils.check(q2);
QueryUtils.checkEqual(q1,q1);

View File

@ -61,12 +61,12 @@ public class TestDateFilter extends LuceneTestCase {
// filter that should preserve matches
// DateFilter df1 = DateFilter.Before("datefield", now);
TermRangeFilter df1 = new TermRangeFilter("datefield", DateTools
TermRangeFilter df1 = TermRangeFilter.newStringRange("datefield", DateTools
.timeToString(now - 2000, DateTools.Resolution.MILLISECOND), DateTools
.timeToString(now, DateTools.Resolution.MILLISECOND), false, true);
// filter that should discard matches
// DateFilter df2 = DateFilter.Before("datefield", now - 999999);
TermRangeFilter df2 = new TermRangeFilter("datefield", DateTools
TermRangeFilter df2 = TermRangeFilter.newStringRange("datefield", DateTools
.timeToString(0, DateTools.Resolution.MILLISECOND), DateTools
.timeToString(now - 2000, DateTools.Resolution.MILLISECOND), true,
false);
@ -128,13 +128,13 @@ public class TestDateFilter extends LuceneTestCase {
// filter that should preserve matches
// DateFilter df1 = DateFilter.After("datefield", now);
TermRangeFilter df1 = new TermRangeFilter("datefield", DateTools
TermRangeFilter df1 = TermRangeFilter.newStringRange("datefield", DateTools
.timeToString(now, DateTools.Resolution.MILLISECOND), DateTools
.timeToString(now + 999999, DateTools.Resolution.MILLISECOND), true,
false);
// filter that should discard matches
// DateFilter df2 = DateFilter.After("datefield", now + 999999);
TermRangeFilter df2 = new TermRangeFilter("datefield", DateTools
TermRangeFilter df2 = TermRangeFilter.newStringRange("datefield", DateTools
.timeToString(now + 999999, DateTools.Resolution.MILLISECOND),
DateTools.timeToString(now + 999999999,
DateTools.Resolution.MILLISECOND), false, true);
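
A minimal sketch of the pattern these filters rely on: DateTools renders timestamps as fixed-width strings whose lexicographic order matches chronological order, so a plain string range selects a time window (field name is illustrative):

    long now = System.currentTimeMillis();
    TermRangeFilter lastHour = TermRangeFilter.newStringRange("datefield",
        DateTools.timeToString(now - 3600 * 1000L, DateTools.Resolution.MILLISECOND),
        DateTools.timeToString(now, DateTools.Resolution.MILLISECOND),
        true, true);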

View File

@ -187,7 +187,7 @@ public class TestFilteredQuery extends LuceneTestCase {
* This tests FilteredQuery's rewrite correctness
*/
public void testRangeQuery() throws Exception {
TermRangeQuery rq = new TermRangeQuery(
TermRangeQuery rq = TermRangeQuery.newStringRange(
"sorter", "b", "d", true, true);
Query filteredquery = new FilteredQuery(rq, filter);

View File

@ -92,25 +92,17 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
/** macro for readability */
public static Query csrq(String f, String l, String h, boolean il, boolean ih) {
TermRangeQuery query = new TermRangeQuery(f, l, h, il, ih);
TermRangeQuery query = TermRangeQuery.newStringRange(f, l, h, il, ih);
query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
return query;
}
public static Query csrq(String f, String l, String h, boolean il, boolean ih, MultiTermQuery.RewriteMethod method) {
TermRangeQuery query = new TermRangeQuery(f, l, h, il, ih);
TermRangeQuery query = TermRangeQuery.newStringRange(f, l, h, il, ih);
query.setRewriteMethod(method);
return query;
}
/** macro for readability */
public static Query csrq(String f, String l, String h, boolean il,
boolean ih, Collator c) {
TermRangeQuery query = new TermRangeQuery(f, l, h, il, ih, c);
query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
return query;
}
/** macro for readability */
public static Query cspq(Term prefix) {
PrefixQuery query = new PrefixQuery(prefix);
@ -141,15 +133,6 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
"data", "pr*t?j")));
}
@Test
public void testBasicsRngCollating() throws IOException {
Collator c = Collator.getInstance(Locale.ENGLISH);
QueryUtils.check(csrq("data", "1", "6", T, T, c));
QueryUtils.check(csrq("data", "A", "Z", T, T, c));
QueryUtils.checkUnequal(csrq("data", "1", "6", T, T, c), csrq("data", "A",
"Z", T, T, c));
}
@Test
public void testEqualScores() throws IOException {
// NOTE: uses index build in *this* setUp
@ -262,7 +245,7 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
// first do a regular TermRangeQuery which uses term expansion so
// docs with more terms in range get higher scores
Query rq = new TermRangeQuery("data", "1", "4", T, T);
Query rq = TermRangeQuery.newStringRange("data", "1", "4", T, T);
ScoreDoc[] expected = search.search(rq, null, 1000).scoreDocs;
int numHits = expected.length;
@ -415,92 +398,6 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
search.close();
}
@Test
public void testRangeQueryIdCollating() throws IOException {
// NOTE: uses index build in *super* setUp
IndexReader reader = signedIndexReader;
IndexSearcher search = newSearcher(reader);
int medId = ((maxId - minId) / 2);
String minIP = pad(minId);
String maxIP = pad(maxId);
String medIP = pad(medId);
int numDocs = reader.numDocs();
assertEquals("num of docs", numDocs, 1 + maxId - minId);
ScoreDoc[] result;
Collator c = Collator.getInstance(Locale.ENGLISH);
// test id, bounded on both ends
result = search.search(csrq("id", minIP, maxIP, T, T, c), null, numDocs).scoreDocs;
assertEquals("find all", numDocs, result.length);
result = search.search(csrq("id", minIP, maxIP, T, F, c), null, numDocs).scoreDocs;
assertEquals("all but last", numDocs - 1, result.length);
result = search.search(csrq("id", minIP, maxIP, F, T, c), null, numDocs).scoreDocs;
assertEquals("all but first", numDocs - 1, result.length);
result = search.search(csrq("id", minIP, maxIP, F, F, c), null, numDocs).scoreDocs;
assertEquals("all but ends", numDocs - 2, result.length);
result = search.search(csrq("id", medIP, maxIP, T, T, c), null, numDocs).scoreDocs;
assertEquals("med and up", 1 + maxId - medId, result.length);
result = search.search(csrq("id", minIP, medIP, T, T, c), null, numDocs).scoreDocs;
assertEquals("up to med", 1 + medId - minId, result.length);
// unbounded id
result = search.search(csrq("id", minIP, null, T, F, c), null, numDocs).scoreDocs;
assertEquals("min and up", numDocs, result.length);
result = search.search(csrq("id", null, maxIP, F, T, c), null, numDocs).scoreDocs;
assertEquals("max and down", numDocs, result.length);
result = search.search(csrq("id", minIP, null, F, F, c), null, numDocs).scoreDocs;
assertEquals("not min, but up", numDocs - 1, result.length);
result = search.search(csrq("id", null, maxIP, F, F, c), null, numDocs).scoreDocs;
assertEquals("not max, but down", numDocs - 1, result.length);
result = search.search(csrq("id", medIP, maxIP, T, F, c), null, numDocs).scoreDocs;
assertEquals("med and up, not max", maxId - medId, result.length);
result = search.search(csrq("id", minIP, medIP, F, T, c), null, numDocs).scoreDocs;
assertEquals("not min, up to med", medId - minId, result.length);
// very small sets
result = search.search(csrq("id", minIP, minIP, F, F, c), null, numDocs).scoreDocs;
assertEquals("min,min,F,F,c", 0, result.length);
result = search.search(csrq("id", medIP, medIP, F, F, c), null, numDocs).scoreDocs;
assertEquals("med,med,F,F,c", 0, result.length);
result = search.search(csrq("id", maxIP, maxIP, F, F, c), null, numDocs).scoreDocs;
assertEquals("max,max,F,F,c", 0, result.length);
result = search.search(csrq("id", minIP, minIP, T, T, c), null, numDocs).scoreDocs;
assertEquals("min,min,T,T,c", 1, result.length);
result = search.search(csrq("id", null, minIP, F, T, c), null, numDocs).scoreDocs;
assertEquals("nul,min,F,T,c", 1, result.length);
result = search.search(csrq("id", maxIP, maxIP, T, T, c), null, numDocs).scoreDocs;
assertEquals("max,max,T,T,c", 1, result.length);
result = search.search(csrq("id", maxIP, null, T, F, c), null, numDocs).scoreDocs;
assertEquals("max,nul,T,T,c", 1, result.length);
result = search.search(csrq("id", medIP, medIP, T, T, c), null, numDocs).scoreDocs;
assertEquals("med,med,T,T,c", 1, result.length);
search.close();
}
@Test
public void testRangeQueryRand() throws IOException {
// NOTE: uses index build in *super* setUp
@ -564,151 +461,4 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
search.close();
}
@Test
public void testRangeQueryRandCollating() throws IOException {
// NOTE: uses index build in *super* setUp
// using the unsigned index because collation seems to ignore hyphens
IndexReader reader = unsignedIndexReader;
IndexSearcher search = newSearcher(reader);
String minRP = pad(unsignedIndexDir.minR);
String maxRP = pad(unsignedIndexDir.maxR);
int numDocs = reader.numDocs();
assertEquals("num of docs", numDocs, 1 + maxId - minId);
ScoreDoc[] result;
Collator c = Collator.getInstance(Locale.ENGLISH);
// test extremes, bounded on both ends
result = search.search(csrq("rand", minRP, maxRP, T, T, c), null, numDocs).scoreDocs;
assertEquals("find all", numDocs, result.length);
result = search.search(csrq("rand", minRP, maxRP, T, F, c), null, numDocs).scoreDocs;
assertEquals("all but biggest", numDocs - 1, result.length);
result = search.search(csrq("rand", minRP, maxRP, F, T, c), null, numDocs).scoreDocs;
assertEquals("all but smallest", numDocs - 1, result.length);
result = search.search(csrq("rand", minRP, maxRP, F, F, c), null, numDocs).scoreDocs;
assertEquals("all but extremes", numDocs - 2, result.length);
// unbounded
result = search.search(csrq("rand", minRP, null, T, F, c), null, numDocs).scoreDocs;
assertEquals("smallest and up", numDocs, result.length);
result = search.search(csrq("rand", null, maxRP, F, T, c), null, numDocs).scoreDocs;
assertEquals("biggest and down", numDocs, result.length);
result = search.search(csrq("rand", minRP, null, F, F, c), null, numDocs).scoreDocs;
assertEquals("not smallest, but up", numDocs - 1, result.length);
result = search.search(csrq("rand", null, maxRP, F, F, c), null, numDocs).scoreDocs;
assertEquals("not biggest, but down", numDocs - 1, result.length);
// very small sets
result = search.search(csrq("rand", minRP, minRP, F, F, c), null, numDocs).scoreDocs;
assertEquals("min,min,F,F,c", 0, result.length);
result = search.search(csrq("rand", maxRP, maxRP, F, F, c), null, numDocs).scoreDocs;
assertEquals("max,max,F,F,c", 0, result.length);
result = search.search(csrq("rand", minRP, minRP, T, T, c), null, numDocs).scoreDocs;
assertEquals("min,min,T,T,c", 1, result.length);
result = search.search(csrq("rand", null, minRP, F, T, c), null, numDocs).scoreDocs;
assertEquals("nul,min,F,T,c", 1, result.length);
result = search.search(csrq("rand", maxRP, maxRP, T, T, c), null, numDocs).scoreDocs;
assertEquals("max,max,T,T,c", 1, result.length);
result = search.search(csrq("rand", maxRP, null, T, F, c), null, numDocs).scoreDocs;
assertEquals("max,nul,T,T,c", 1, result.length);
search.close();
}
@Test
public void testFarsi() throws Exception {
/* build an index */
Directory farsiIndex = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, farsiIndex, new MockAnalyzer(MockTokenizer.SIMPLE, true));
Document doc = new Document();
doc.add(newField("content", "\u0633\u0627\u0628", Field.Store.YES,
Field.Index.NOT_ANALYZED));
doc
.add(newField("body", "body", Field.Store.YES,
Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
IndexReader reader = writer.getReader();
writer.close();
IndexSearcher search = newSearcher(reader);
// Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in
// RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
// characters properly.
Collator c = Collator.getInstance(new Locale("ar"));
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// index Term below should NOT be returned by a ConstantScoreRangeQuery
// with a Farsi Collator (or an Arabic one for the case when Farsi is
// not supported).
ScoreDoc[] result = search.search(csrq("content", "\u062F", "\u0698", T, T,
c), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
result = search.search(csrq("content", "\u0633", "\u0638", T, T, c), null,
1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
search.close();
reader.close();
farsiIndex.close();
}
@Test
public void testDanish() throws Exception {
/* build an index */
Directory danishIndex = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, danishIndex, new MockAnalyzer(MockTokenizer.SIMPLE, true));
// Danish collation orders the words below in the given order
// (example taken from TestSort.testInternationalSort() ).
String[] words = { "H\u00D8T", "H\u00C5T", "MAND" };
for (int docnum = 0 ; docnum < words.length ; ++docnum) {
Document doc = new Document();
doc.add(newField("content", words[docnum],
Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(newField("body", "body",
Field.Store.YES, Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
}
IndexReader reader = writer.getReader();
writer.close();
IndexSearcher search = newSearcher(reader);
Collator c = Collator.getInstance(new Locale("da", "dk"));
// Unicode order would not include "H\u00C5T" in [ "H\u00D8T", "MAND" ],
// but Danish collation does.
ScoreDoc[] result = search.search
(csrq("content", "H\u00D8T", "MAND", F, F, c), null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
result = search.search
(csrq("content", "H\u00C5T", "MAND", F, F, c), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
search.close();
reader.close();
danishIndex.close();
}
}
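
The csrq(...) helpers reduce to one idea: any MultiTermQuery can be switched from its scoring BooleanQuery expansion to a constant-score filter rewrite. A sketch with illustrative field and bounds:

    TermRangeQuery q = TermRangeQuery.newStringRange("data", "1", "6", true, true);
    q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
    // every matching document now receives the same score (the query boost)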

View File

@ -112,7 +112,7 @@ public class TestMultiTermQueryRewrites extends LuceneTestCase {
}
private void checkDuplicateTerms(MultiTermQuery.RewriteMethod method) throws Exception {
final MultiTermQuery mtq = new TermRangeQuery("data", "2", "7", true, true);
final MultiTermQuery mtq = TermRangeQuery.newStringRange("data", "2", "7", true, true);
mtq.setRewriteMethod(method);
final Query q1 = searcher.rewrite(mtq);
final Query q2 = multiSearcher.rewrite(mtq);
@ -158,7 +158,7 @@ public class TestMultiTermQueryRewrites extends LuceneTestCase {
final MultiTermQuery mtq = new MultiTermQuery("data") {
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
return new TermRangeTermsEnum(terms.iterator(), "2", "7", true, true, null) {
return new TermRangeTermsEnum(terms.iterator(), new BytesRef("2"), new BytesRef("7"), true, true) {
final BoostAttribute boostAtt =
attributes().addAttribute(BoostAttribute.class);
@ -203,7 +203,7 @@ public class TestMultiTermQueryRewrites extends LuceneTestCase {
// default gets restored automatically by LuceneTestCase:
BooleanQuery.setMaxClauseCount(3);
final MultiTermQuery mtq = new TermRangeQuery("data", "2", "7", true, true);
final MultiTermQuery mtq = TermRangeQuery.newStringRange("data", "2", "7", true, true);
mtq.setRewriteMethod(method);
try {
multiSearcherDupls.rewrite(mtq);
@ -219,7 +219,7 @@ public class TestMultiTermQueryRewrites extends LuceneTestCase {
// default gets restored automatically by LuceneTestCase:
BooleanQuery.setMaxClauseCount(3);
final MultiTermQuery mtq = new TermRangeQuery("data", "2", "7", true, true);
final MultiTermQuery mtq = TermRangeQuery.newStringRange("data", "2", "7", true, true);
mtq.setRewriteMethod(method);
multiSearcherDupls.rewrite(mtq);
}
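
For context on the clause-count tests above: under the scoring BooleanQuery rewrite, each matching term becomes one clause, so a range that expands past the cap throws. A sketch (the searcher variable is assumed to be in scope, as in the test class):

    BooleanQuery.setMaxClauseCount(3);
    MultiTermQuery mtq = TermRangeQuery.newStringRange("data", "2", "7", true, true);
    mtq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
    try {
      searcher.rewrite(mtq);  // more than 3 matching terms => exception
    } catch (BooleanQuery.TooManyClauses expected) {
      // the range expanded to more clauses than the cap allows
    }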

View File

@ -67,7 +67,7 @@ public class TestMultiValuedNumericRangeQuery extends LuceneTestCase {
if (lower>upper) {
int a=lower; lower=upper; upper=a;
}
TermRangeQuery cq=new TermRangeQuery("asc", format.format(lower), format.format(upper), true, true);
TermRangeQuery cq=TermRangeQuery.newStringRange("asc", format.format(lower), format.format(upper), true, true);
NumericRangeQuery<Integer> tq=NumericRangeQuery.newIntRange("trie", lower, upper, true, true);
TopDocs trTopDocs = searcher.search(cq, 1);
TopDocs nrTopDocs = searcher.search(tq, 1);

View File

@ -344,12 +344,10 @@ public class TestNumericRangeQuery32 extends LuceneTestCase {
final BytesRef lowerBytes = new BytesRef(NumericUtils.BUF_SIZE_INT), upperBytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
NumericUtils.intToPrefixCoded(lower, 0, lowerBytes);
NumericUtils.intToPrefixCoded(upper, 0, upperBytes);
// TODO: when new TermRange ctors with BytesRef available, use them and do not convert to string!
final String lowerString = lowerBytes.utf8ToString(), upperString = upperBytes.utf8ToString();
// test inclusive range
NumericRangeQuery<Integer> tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, true, true);
TermRangeQuery cq=new TermRangeQuery(field, lowerString, upperString, true, true);
TermRangeQuery cq=new TermRangeQuery(field, lowerBytes, upperBytes, true, true);
TopDocs tTopDocs = searcher.search(tq, 1);
TopDocs cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
@ -357,7 +355,7 @@ public class TestNumericRangeQuery32 extends LuceneTestCase {
termCountC += cq.getTotalNumberOfTerms();
// test exclusive range
tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, false, false);
cq=new TermRangeQuery(field, lowerString, upperString, false, false);
cq=new TermRangeQuery(field, lowerBytes, upperBytes, false, false);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
@ -365,7 +363,7 @@ public class TestNumericRangeQuery32 extends LuceneTestCase {
termCountC += cq.getTotalNumberOfTerms();
// test left exclusive range
tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, false, true);
cq=new TermRangeQuery(field, lowerString, upperString, false, true);
cq=new TermRangeQuery(field, lowerBytes, upperBytes, false, true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
@ -373,7 +371,7 @@ public class TestNumericRangeQuery32 extends LuceneTestCase {
termCountC += cq.getTotalNumberOfTerms();
// test right exclusive range
tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, true, false);
cq=new TermRangeQuery(field, lowerString, upperString, true, false);
cq=new TermRangeQuery(field, lowerBytes, upperBytes, true, false);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );

View File

@ -361,12 +361,10 @@ public class TestNumericRangeQuery64 extends LuceneTestCase {
final BytesRef lowerBytes = new BytesRef(NumericUtils.BUF_SIZE_LONG), upperBytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
NumericUtils.longToPrefixCoded(lower, 0, lowerBytes);
NumericUtils.longToPrefixCoded(upper, 0, upperBytes);
// TODO: when new TermRange ctors with BytesRef available, use them and do not convert to string!
final String lowerString = lowerBytes.utf8ToString(), upperString = upperBytes.utf8ToString();
// test inclusive range
NumericRangeQuery<Long> tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, true, true);
TermRangeQuery cq=new TermRangeQuery(field, lowerString, upperString, true, true);
TermRangeQuery cq=new TermRangeQuery(field, lowerBytes, upperBytes, true, true);
TopDocs tTopDocs = searcher.search(tq, 1);
TopDocs cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
@ -374,7 +372,7 @@ public class TestNumericRangeQuery64 extends LuceneTestCase {
termCountC += cq.getTotalNumberOfTerms();
// test exclusive range
tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, false, false);
cq=new TermRangeQuery(field, lowerString, upperString, false, false);
cq=new TermRangeQuery(field, lowerBytes, upperBytes, false, false);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
@ -382,7 +380,7 @@ public class TestNumericRangeQuery64 extends LuceneTestCase {
termCountC += cq.getTotalNumberOfTerms();
// test left exclusive range
tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, false, true);
cq=new TermRangeQuery(field, lowerString, upperString, false, true);
cq=new TermRangeQuery(field, lowerBytes, upperBytes, false, true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
@ -390,7 +388,7 @@ public class TestNumericRangeQuery64 extends LuceneTestCase {
termCountC += cq.getTotalNumberOfTerms();
// test right exclusive range
tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, true, false);
cq=new TermRangeQuery(field, lowerString, upperString, true, false);
cq=new TermRangeQuery(field, lowerBytes, upperBytes, true, false);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );

View File

@ -18,12 +18,8 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.text.Collator;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
@ -110,11 +106,6 @@ public class TestSort extends LuceneTestCase {
{ "d", "m", null, null, null, null, null, null, null, null, null, null}
};
// the sort order of Ø versus U depends on the version of the rules being used
// for the inherited root locale: Ø's order isn't specified in Locale.US since
// it's not used in English.
private boolean oStrokeFirst = Collator.getInstance(new Locale("")).compare("Ø", "U") < 0;
// create an index of all the documents, or just the x, or just the y documents
private IndexSearcher getIndex (boolean even, boolean odd)
throws IOException {
@ -564,12 +555,6 @@ public class TestSort extends LuceneTestCase {
sort.setSort (new SortField ("string", SortField.STRING, true) );
assertMatches (full, queryF, sort, "IJZ");
sort.setSort (new SortField ("i18n", Locale.ENGLISH));
assertMatches (full, queryF, sort, "ZJI");
sort.setSort (new SortField ("i18n", Locale.ENGLISH, true));
assertMatches (full, queryF, sort, "IJZ");
sort.setSort (new SortField ("int", SortField.INT) );
assertMatches (full, queryF, sort, "IZJ");
@ -630,36 +615,6 @@ public class TestSort extends LuceneTestCase {
assertMatches (full, queryX, sort, "GICEA");
}
// test using a Locale for sorting strings
public void testLocaleSort() throws Exception {
sort.setSort (new SortField ("string", Locale.US) );
assertMatches (full, queryX, sort, "AIGEC");
assertMatches (full, queryY, sort, "DJHFB");
sort.setSort (new SortField ("string", Locale.US, true) );
assertMatches (full, queryX, sort, "CEGIA");
assertMatches (full, queryY, sort, "BFHJD");
}
// test using various international locales with accented characters
// (which sort differently depending on locale)
public void testInternationalSort() throws Exception {
sort.setSort (new SortField ("i18n", Locale.US));
assertMatches (full, queryY, sort, oStrokeFirst ? "BFJHD" : "BFJDH");
sort.setSort (new SortField ("i18n", new Locale("sv", "se")));
assertMatches (full, queryY, sort, "BJDFH");
sort.setSort (new SortField ("i18n", new Locale("da", "dk")));
assertMatches (full, queryY, sort, "BJDHF");
sort.setSort (new SortField ("i18n", Locale.US));
assertMatches (full, queryX, sort, "ECAGI");
sort.setSort (new SortField ("i18n", Locale.FRANCE));
assertMatches (full, queryX, sort, "EACGI");
}
// test a variety of sorts using a parallel multisearcher
public void testParallelMultiSort() throws Exception {
ExecutorService exec = Executors.newFixedThreadPool(_TestUtil.nextInt(random, 2, 8));
@ -976,19 +931,6 @@ public class TestSort extends LuceneTestCase {
assertSaneFieldCaches(getName() + " various");
// next we'll check Locale based (String[]) for 'string', so purge first
FieldCache.DEFAULT.purgeAllCaches();
sort.setSort(new SortField ("string", Locale.US) );
assertMatches(multi, queryA, sort, "DJAIHGFEBC");
sort.setSort(new SortField ("string", Locale.US, true) );
assertMatches(multi, queryA, sort, "CBEFGHIAJD");
sort.setSort(new SortField ("string", Locale.UK) );
assertMatches(multi, queryA, sort, "DJAIHGFEBC");
assertSaneFieldCaches(getName() + " Locale.US + Locale.UK");
FieldCache.DEFAULT.purgeAllCaches();
}
private void assertMatches(IndexSearcher searcher, Query query, Sort sort, String expectedResult) throws IOException {
@ -1014,37 +956,6 @@ public class TestSort extends LuceneTestCase {
assertEquals (msg, expectedResult, buff.toString());
}
private HashMap<String,Float> getScores (ScoreDoc[] hits, IndexSearcher searcher)
throws IOException {
HashMap<String,Float> scoreMap = new HashMap<String,Float>();
int n = hits.length;
for (int i=0; i<n; ++i) {
Document doc = searcher.doc(hits[i].doc);
String[] v = doc.getValues("tracer");
assertEquals (v.length, 1);
scoreMap.put (v[0], Float.valueOf(hits[i].score));
}
return scoreMap;
}
// make sure all the values in the maps match
private <K, V> void assertSameValues (HashMap<K,V> m1, HashMap<K,V> m2) {
int n = m1.size();
int m = m2.size();
assertEquals (n, m);
Iterator<K> iter = m1.keySet().iterator();
while (iter.hasNext()) {
K key = iter.next();
V o1 = m1.get(key);
V o2 = m2.get(key);
if (o1 instanceof Float) {
assertEquals(((Float)o1).floatValue(), ((Float)o2).floatValue(), 1e-6);
} else {
assertEquals (m1.get(key), m2.get(key));
}
}
}
public void testEmptyStringVsNullStringSort() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
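
With the Locale-based SortField constructors removed, string sorting in these tests falls back to the terms' byte order; a sketch of the remaining form (query and searcher are assumed in scope):

    Sort sort = new Sort(new SortField("string", SortField.STRING));
    TopDocs td = searcher.search(query, null, 100, sort);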

View File

@ -18,15 +18,9 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.text.Collator;
import java.util.Locale;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.junit.Test;
/**
@ -61,193 +55,89 @@ public class TestTermRangeFilter extends BaseTestRangeFilter {
// test id, bounded on both ends
result = search.search(q, new TermRangeFilter("id", minIP, maxIP, T, T),
result = search.search(q, TermRangeFilter.newStringRange("id", minIP, maxIP, T, T),
numDocs).scoreDocs;
assertEquals("find all", numDocs, result.length);
result = search.search(q, new TermRangeFilter("id", minIP, maxIP, T, F),
result = search.search(q, TermRangeFilter.newStringRange("id", minIP, maxIP, T, F),
numDocs).scoreDocs;
assertEquals("all but last", numDocs - 1, result.length);
result = search.search(q, new TermRangeFilter("id", minIP, maxIP, F, T),
result = search.search(q, TermRangeFilter.newStringRange("id", minIP, maxIP, F, T),
numDocs).scoreDocs;
assertEquals("all but first", numDocs - 1, result.length);
result = search.search(q, new TermRangeFilter("id", minIP, maxIP, F, F),
result = search.search(q, TermRangeFilter.newStringRange("id", minIP, maxIP, F, F),
numDocs).scoreDocs;
assertEquals("all but ends", numDocs - 2, result.length);
result = search.search(q, new TermRangeFilter("id", medIP, maxIP, T, T),
result = search.search(q, TermRangeFilter.newStringRange("id", medIP, maxIP, T, T),
numDocs).scoreDocs;
assertEquals("med and up", 1 + maxId - medId, result.length);
result = search.search(q, new TermRangeFilter("id", minIP, medIP, T, T),
result = search.search(q, TermRangeFilter.newStringRange("id", minIP, medIP, T, T),
numDocs).scoreDocs;
assertEquals("up to med", 1 + medId - minId, result.length);
// unbounded id
result = search.search(q, new TermRangeFilter("id", minIP, null, T, F),
result = search.search(q, TermRangeFilter.newStringRange("id", minIP, null, T, F),
numDocs).scoreDocs;
assertEquals("min and up", numDocs, result.length);
result = search.search(q, new TermRangeFilter("id", null, maxIP, F, T),
result = search.search(q, TermRangeFilter.newStringRange("id", null, maxIP, F, T),
numDocs).scoreDocs;
assertEquals("max and down", numDocs, result.length);
result = search.search(q, new TermRangeFilter("id", minIP, null, F, F),
result = search.search(q, TermRangeFilter.newStringRange("id", minIP, null, F, F),
numDocs).scoreDocs;
assertEquals("not min, but up", numDocs - 1, result.length);
result = search.search(q, new TermRangeFilter("id", null, maxIP, F, F),
result = search.search(q, TermRangeFilter.newStringRange("id", null, maxIP, F, F),
numDocs).scoreDocs;
assertEquals("not max, but down", numDocs - 1, result.length);
result = search.search(q, new TermRangeFilter("id", medIP, maxIP, T, F),
result = search.search(q, TermRangeFilter.newStringRange("id", medIP, maxIP, T, F),
numDocs).scoreDocs;
assertEquals("med and up, not max", maxId - medId, result.length);
result = search.search(q, new TermRangeFilter("id", minIP, medIP, F, T),
result = search.search(q, TermRangeFilter.newStringRange("id", minIP, medIP, F, T),
numDocs).scoreDocs;
assertEquals("not min, up to med", medId - minId, result.length);
// very small sets
result = search.search(q, new TermRangeFilter("id", minIP, minIP, F, F),
result = search.search(q, TermRangeFilter.newStringRange("id", minIP, minIP, F, F),
numDocs).scoreDocs;
assertEquals("min,min,F,F", 0, result.length);
result = search.search(q, new TermRangeFilter("id", medIP, medIP, F, F),
result = search.search(q, TermRangeFilter.newStringRange("id", medIP, medIP, F, F),
numDocs).scoreDocs;
assertEquals("med,med,F,F", 0, result.length);
result = search.search(q, new TermRangeFilter("id", maxIP, maxIP, F, F),
result = search.search(q, TermRangeFilter.newStringRange("id", maxIP, maxIP, F, F),
numDocs).scoreDocs;
assertEquals("max,max,F,F", 0, result.length);
result = search.search(q, new TermRangeFilter("id", minIP, minIP, T, T),
result = search.search(q, TermRangeFilter.newStringRange("id", minIP, minIP, T, T),
numDocs).scoreDocs;
assertEquals("min,min,T,T", 1, result.length);
result = search.search(q, new TermRangeFilter("id", null, minIP, F, T),
result = search.search(q, TermRangeFilter.newStringRange("id", null, minIP, F, T),
numDocs).scoreDocs;
assertEquals("nul,min,F,T", 1, result.length);
result = search.search(q, new TermRangeFilter("id", maxIP, maxIP, T, T),
result = search.search(q, TermRangeFilter.newStringRange("id", maxIP, maxIP, T, T),
numDocs).scoreDocs;
assertEquals("max,max,T,T", 1, result.length);
result = search.search(q, new TermRangeFilter("id", maxIP, null, T, F),
result = search.search(q, TermRangeFilter.newStringRange("id", maxIP, null, T, F),
numDocs).scoreDocs;
assertEquals("max,nul,T,T", 1, result.length);
result = search.search(q, new TermRangeFilter("id", medIP, medIP, T, T),
result = search.search(q, TermRangeFilter.newStringRange("id", medIP, medIP, T, T),
numDocs).scoreDocs;
assertEquals("med,med,T,T", 1, result.length);
search.close();
}
@Test
public void testRangeFilterIdCollating() throws IOException {
IndexReader reader = signedIndexReader;
IndexSearcher search = newSearcher(reader);
Collator c = Collator.getInstance(Locale.ENGLISH);
int medId = ((maxId - minId) / 2);
String minIP = pad(minId);
String maxIP = pad(maxId);
String medIP = pad(medId);
int numDocs = reader.numDocs();
assertEquals("num of docs", numDocs, 1 + maxId - minId);
Query q = new TermQuery(new Term("body", "body"));
// test id, bounded on both ends
int numHits = search.search(q, new TermRangeFilter("id", minIP, maxIP, T,
T, c), 1000).totalHits;
assertEquals("find all", numDocs, numHits);
numHits = search.search(q,
new TermRangeFilter("id", minIP, maxIP, T, F, c), 1000).totalHits;
assertEquals("all but last", numDocs - 1, numHits);
numHits = search.search(q,
new TermRangeFilter("id", minIP, maxIP, F, T, c), 1000).totalHits;
assertEquals("all but first", numDocs - 1, numHits);
numHits = search.search(q,
new TermRangeFilter("id", minIP, maxIP, F, F, c), 1000).totalHits;
assertEquals("all but ends", numDocs - 2, numHits);
numHits = search.search(q,
new TermRangeFilter("id", medIP, maxIP, T, T, c), 1000).totalHits;
assertEquals("med and up", 1 + maxId - medId, numHits);
numHits = search.search(q,
new TermRangeFilter("id", minIP, medIP, T, T, c), 1000).totalHits;
assertEquals("up to med", 1 + medId - minId, numHits);
// unbounded id
numHits = search.search(q, new TermRangeFilter("id", minIP, null, T, F, c),
1000).totalHits;
assertEquals("min and up", numDocs, numHits);
numHits = search.search(q, new TermRangeFilter("id", null, maxIP, F, T, c),
1000).totalHits;
assertEquals("max and down", numDocs, numHits);
numHits = search.search(q, new TermRangeFilter("id", minIP, null, F, F, c),
1000).totalHits;
assertEquals("not min, but up", numDocs - 1, numHits);
numHits = search.search(q, new TermRangeFilter("id", null, maxIP, F, F, c),
1000).totalHits;
assertEquals("not max, but down", numDocs - 1, numHits);
numHits = search.search(q,
new TermRangeFilter("id", medIP, maxIP, T, F, c), 1000).totalHits;
assertEquals("med and up, not max", maxId - medId, numHits);
numHits = search.search(q,
new TermRangeFilter("id", minIP, medIP, F, T, c), 1000).totalHits;
assertEquals("not min, up to med", medId - minId, numHits);
// very small sets
numHits = search.search(q,
new TermRangeFilter("id", minIP, minIP, F, F, c), 1000).totalHits;
assertEquals("min,min,F,F", 0, numHits);
numHits = search.search(q,
new TermRangeFilter("id", medIP, medIP, F, F, c), 1000).totalHits;
assertEquals("med,med,F,F", 0, numHits);
numHits = search.search(q,
new TermRangeFilter("id", maxIP, maxIP, F, F, c), 1000).totalHits;
assertEquals("max,max,F,F", 0, numHits);
numHits = search.search(q,
new TermRangeFilter("id", minIP, minIP, T, T, c), 1000).totalHits;
assertEquals("min,min,T,T", 1, numHits);
numHits = search.search(q, new TermRangeFilter("id", null, minIP, F, T, c),
1000).totalHits;
assertEquals("nul,min,F,T", 1, numHits);
numHits = search.search(q,
new TermRangeFilter("id", maxIP, maxIP, T, T, c), 1000).totalHits;
assertEquals("max,max,T,T", 1, numHits);
numHits = search.search(q, new TermRangeFilter("id", maxIP, null, T, F, c),
1000).totalHits;
assertEquals("max,nul,T,T", 1, numHits);
numHits = search.search(q,
new TermRangeFilter("id", medIP, medIP, T, T, c), 1000).totalHits;
assertEquals("med,med,T,T", 1, numHits);
search.close();
}
@Test
public void testRangeFilterRand() throws IOException {
@ -266,223 +156,63 @@ public class TestTermRangeFilter extends BaseTestRangeFilter {
// test extremes, bounded on both ends
result = search.search(q, new TermRangeFilter("rand", minRP, maxRP, T, T),
result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, maxRP, T, T),
numDocs).scoreDocs;
assertEquals("find all", numDocs, result.length);
result = search.search(q, new TermRangeFilter("rand", minRP, maxRP, T, F),
result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, maxRP, T, F),
numDocs).scoreDocs;
assertEquals("all but biggest", numDocs - 1, result.length);
result = search.search(q, new TermRangeFilter("rand", minRP, maxRP, F, T),
result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, maxRP, F, T),
numDocs).scoreDocs;
assertEquals("all but smallest", numDocs - 1, result.length);
result = search.search(q, new TermRangeFilter("rand", minRP, maxRP, F, F),
result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, maxRP, F, F),
numDocs).scoreDocs;
assertEquals("all but extremes", numDocs - 2, result.length);
// unbounded
result = search.search(q, new TermRangeFilter("rand", minRP, null, T, F),
result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, null, T, F),
numDocs).scoreDocs;
assertEquals("smallest and up", numDocs, result.length);
result = search.search(q, new TermRangeFilter("rand", null, maxRP, F, T),
result = search.search(q, TermRangeFilter.newStringRange("rand", null, maxRP, F, T),
numDocs).scoreDocs;
assertEquals("biggest and down", numDocs, result.length);
result = search.search(q, new TermRangeFilter("rand", minRP, null, F, F),
result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, null, F, F),
numDocs).scoreDocs;
assertEquals("not smallest, but up", numDocs - 1, result.length);
result = search.search(q, new TermRangeFilter("rand", null, maxRP, F, F),
result = search.search(q, TermRangeFilter.newStringRange("rand", null, maxRP, F, F),
numDocs).scoreDocs;
assertEquals("not biggest, but down", numDocs - 1, result.length);
// very small sets
result = search.search(q, new TermRangeFilter("rand", minRP, minRP, F, F),
result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, minRP, F, F),
numDocs).scoreDocs;
assertEquals("min,min,F,F", 0, result.length);
result = search.search(q, new TermRangeFilter("rand", maxRP, maxRP, F, F),
result = search.search(q, TermRangeFilter.newStringRange("rand", maxRP, maxRP, F, F),
numDocs).scoreDocs;
assertEquals("max,max,F,F", 0, result.length);
result = search.search(q, new TermRangeFilter("rand", minRP, minRP, T, T),
result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, minRP, T, T),
numDocs).scoreDocs;
assertEquals("min,min,T,T", 1, result.length);
result = search.search(q, new TermRangeFilter("rand", null, minRP, F, T),
result = search.search(q, TermRangeFilter.newStringRange("rand", null, minRP, F, T),
numDocs).scoreDocs;
assertEquals("nul,min,F,T", 1, result.length);
result = search.search(q, new TermRangeFilter("rand", maxRP, maxRP, T, T),
result = search.search(q, TermRangeFilter.newStringRange("rand", maxRP, maxRP, T, T),
numDocs).scoreDocs;
assertEquals("max,max,T,T", 1, result.length);
result = search.search(q, new TermRangeFilter("rand", maxRP, null, T, F),
result = search.search(q, TermRangeFilter.newStringRange("rand", maxRP, null, T, F),
numDocs).scoreDocs;
assertEquals("max,nul,T,T", 1, result.length);
search.close();
}
@Test
public void testRangeFilterRandCollating() throws IOException {
// using the unsigned index because collation seems to ignore hyphens
IndexReader reader = unsignedIndexReader;
IndexSearcher search = newSearcher(reader);
Collator c = Collator.getInstance(Locale.ENGLISH);
String minRP = pad(unsignedIndexDir.minR);
String maxRP = pad(unsignedIndexDir.maxR);
int numDocs = reader.numDocs();
assertEquals("num of docs", numDocs, 1 + maxId - minId);
Query q = new TermQuery(new Term("body", "body"));
// test extremes, bounded on both ends
int numHits = search.search(q, new TermRangeFilter("rand", minRP, maxRP, T,
T, c), 1000).totalHits;
assertEquals("find all", numDocs, numHits);
numHits = search.search(q, new TermRangeFilter("rand", minRP, maxRP, T, F,
c), 1000).totalHits;
assertEquals("all but biggest", numDocs - 1, numHits);
numHits = search.search(q, new TermRangeFilter("rand", minRP, maxRP, F, T,
c), 1000).totalHits;
assertEquals("all but smallest", numDocs - 1, numHits);
numHits = search.search(q, new TermRangeFilter("rand", minRP, maxRP, F, F,
c), 1000).totalHits;
assertEquals("all but extremes", numDocs - 2, numHits);
// unbounded
numHits = search.search(q,
new TermRangeFilter("rand", minRP, null, T, F, c), 1000).totalHits;
assertEquals("smallest and up", numDocs, numHits);
numHits = search.search(q,
new TermRangeFilter("rand", null, maxRP, F, T, c), 1000).totalHits;
assertEquals("biggest and down", numDocs, numHits);
numHits = search.search(q,
new TermRangeFilter("rand", minRP, null, F, F, c), 1000).totalHits;
assertEquals("not smallest, but up", numDocs - 1, numHits);
numHits = search.search(q,
new TermRangeFilter("rand", null, maxRP, F, F, c), 1000).totalHits;
assertEquals("not biggest, but down", numDocs - 1, numHits);
// very small sets
numHits = search.search(q, new TermRangeFilter("rand", minRP, minRP, F, F,
c), 1000).totalHits;
assertEquals("min,min,F,F", 0, numHits);
numHits = search.search(q, new TermRangeFilter("rand", maxRP, maxRP, F, F,
c), 1000).totalHits;
assertEquals("max,max,F,F", 0, numHits);
numHits = search.search(q, new TermRangeFilter("rand", minRP, minRP, T, T,
c), 1000).totalHits;
assertEquals("min,min,T,T", 1, numHits);
numHits = search.search(q,
new TermRangeFilter("rand", null, minRP, F, T, c), 1000).totalHits;
assertEquals("nul,min,F,T", 1, numHits);
numHits = search.search(q, new TermRangeFilter("rand", maxRP, maxRP, T, T,
c), 1000).totalHits;
assertEquals("max,max,T,T", 1, numHits);
numHits = search.search(q,
new TermRangeFilter("rand", maxRP, null, T, F, c), 1000).totalHits;
assertEquals("max,nul,T,T", 1, numHits);
search.close();
}
@Test
public void testFarsi() throws Exception {
/* build an index */
Directory farsiIndex = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, farsiIndex);
Document doc = new Document();
doc.add(newField("content", "\u0633\u0627\u0628", Field.Store.YES,
Field.Index.NOT_ANALYZED));
doc
.add(newField("body", "body", Field.Store.YES,
Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
IndexReader reader = writer.getReader();
writer.close();
IndexSearcher search = newSearcher(reader);
Query q = new TermQuery(new Term("body", "body"));
// Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in
// RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
// characters properly.
Collator collator = Collator.getInstance(new Locale("ar"));
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// index Term below should NOT be returned by a TermRangeFilter with a Farsi
// Collator (or an Arabic one for the case when Farsi is not supported).
int numHits = search.search(q, new TermRangeFilter("content", "\u062F",
"\u0698", T, T, collator), 1000).totalHits;
assertEquals("The index Term should not be included.", 0, numHits);
numHits = search.search(q, new TermRangeFilter("content", "\u0633",
"\u0638", T, T, collator), 1000).totalHits;
assertEquals("The index Term should be included.", 1, numHits);
search.close();
reader.close();
farsiIndex.close();
}
@Test
public void testDanish() throws Exception {
/* build an index */
Directory danishIndex = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, danishIndex);
// Danish collation orders the words below in the given order
// (example taken from TestSort.testInternationalSort() ).
String[] words = {"H\u00D8T", "H\u00C5T", "MAND"};
for (int docnum = 0; docnum < words.length; ++docnum) {
Document doc = new Document();
doc.add(newField("content", words[docnum], Field.Store.YES,
Field.Index.NOT_ANALYZED));
doc.add(newField("body", "body", Field.Store.YES,
Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
}
IndexReader reader = writer.getReader();
writer.close();
IndexSearcher search = newSearcher(reader);
Query q = new TermQuery(new Term("body", "body"));
Collator collator = Collator.getInstance(new Locale("da", "dk"));
// Unicode order would not include "H\u00C5T" in [ "H\u00D8T", "MAND" ],
// but Danish collation does.
int numHits = search.search(q, new TermRangeFilter("content", "H\u00D8T",
"MAND", F, F, collator), 1000).totalHits;
assertEquals("The index Term should be included.", 1, numHits);
numHits = search.search(q, new TermRangeFilter("content", "H\u00C5T",
"MAND", F, F, collator), 1000).totalHits;
assertEquals("The index Term should not be included.", 0, numHits);
search.close();
reader.close();
danishIndex.close();
}
}

View File

@ -53,7 +53,7 @@ public class TestTermRangeQuery extends LuceneTestCase {
}
public void testExclusive() throws Exception {
Query query = new TermRangeQuery("content", "A", "C", false, false);
Query query = TermRangeQuery.newStringRange("content", "A", "C", false, false);
initializeIndex(new String[] {"A", "B", "C", "D"});
IndexSearcher searcher = new IndexSearcher(dir, true);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
@ -74,7 +74,7 @@ public class TestTermRangeQuery extends LuceneTestCase {
}
public void testInclusive() throws Exception {
Query query = new TermRangeQuery("content", "A", "C", true, true);
Query query = TermRangeQuery.newStringRange("content", "A", "C", true, true);
initializeIndex(new String[]{"A", "B", "C", "D"});
IndexSearcher searcher = new IndexSearcher(dir, true);
@ -105,11 +105,11 @@ public class TestTermRangeQuery extends LuceneTestCase {
query = new TermRangeQuery("content", null, null, false, false);
assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length);
query = new TermRangeQuery("content", "", null, true, false);
query = TermRangeQuery.newStringRange("content", "", null, true, false);
assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length);
// and now another one
query = new TermRangeQuery("content", "B", null, true, false);
query = TermRangeQuery.newStringRange("content", "B", null, true, false);
assertTrue(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
assertEquals(3, searcher.search(query, null, 1000).scoreDocs.length);
searcher.close();
@ -121,7 +121,7 @@ public class TestTermRangeQuery extends LuceneTestCase {
initializeIndex(new String[]{"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K"});
IndexSearcher searcher = new IndexSearcher(dir, true);
TermRangeQuery query = new TermRangeQuery("content", "B", "J", true, true);
TermRangeQuery query = TermRangeQuery.newStringRange("content", "B", "J", true, true);
checkBooleanTerms(searcher, query, "B", "C", "D", "E", "F", "G", "H", "I", "J");
final int savedClauseCount = BooleanQuery.getMaxClauseCount();
@ -150,10 +150,10 @@ public class TestTermRangeQuery extends LuceneTestCase {
}
public void testEqualsHashcode() {
Query query = new TermRangeQuery("content", "A", "C", true, true);
Query query = TermRangeQuery.newStringRange("content", "A", "C", true, true);
query.setBoost(1.0f);
Query other = new TermRangeQuery("content", "A", "C", true, true);
Query other = TermRangeQuery.newStringRange("content", "A", "C", true, true);
other.setBoost(1.0f);
assertEquals("query equals itself is true", query, query);
@ -163,120 +163,32 @@ public class TestTermRangeQuery extends LuceneTestCase {
other.setBoost(2.0f);
assertFalse("Different boost queries are not equal", query.equals(other));
other = new TermRangeQuery("notcontent", "A", "C", true, true);
other = TermRangeQuery.newStringRange("notcontent", "A", "C", true, true);
assertFalse("Different fields are not equal", query.equals(other));
other = new TermRangeQuery("content", "X", "C", true, true);
other = TermRangeQuery.newStringRange("content", "X", "C", true, true);
assertFalse("Different lower terms are not equal", query.equals(other));
other = new TermRangeQuery("content", "A", "Z", true, true);
other = TermRangeQuery.newStringRange("content", "A", "Z", true, true);
assertFalse("Different upper terms are not equal", query.equals(other));
query = new TermRangeQuery("content", null, "C", true, true);
other = new TermRangeQuery("content", null, "C", true, true);
query = TermRangeQuery.newStringRange("content", null, "C", true, true);
other = TermRangeQuery.newStringRange("content", null, "C", true, true);
assertEquals("equivalent queries with null lowerterms are equal()", query, other);
assertEquals("hashcode must return same value when equals is true", query.hashCode(), other.hashCode());
query = new TermRangeQuery("content", "C", null, true, true);
other = new TermRangeQuery("content", "C", null, true, true);
query = TermRangeQuery.newStringRange("content", "C", null, true, true);
other = TermRangeQuery.newStringRange("content", "C", null, true, true);
assertEquals("equivalent queries with null upperterms are equal()", query, other);
assertEquals("hashcode returns same value", query.hashCode(), other.hashCode());
query = new TermRangeQuery("content", null, "C", true, true);
other = new TermRangeQuery("content", "C", null, true, true);
query = TermRangeQuery.newStringRange("content", null, "C", true, true);
other = TermRangeQuery.newStringRange("content", "C", null, true, true);
assertFalse("queries with different upper and lower terms are not equal", query.equals(other));
query = new TermRangeQuery("content", "A", "C", false, false);
other = new TermRangeQuery("content", "A", "C", true, true);
query = TermRangeQuery.newStringRange("content", "A", "C", false, false);
other = TermRangeQuery.newStringRange("content", "A", "C", true, true);
assertFalse("queries with different inclusive are not equal", query.equals(other));
query = new TermRangeQuery("content", "A", "C", false, false);
other = new TermRangeQuery("content", "A", "C", false, false, Collator.getInstance());
assertFalse("a query with a collator is not equal to one without", query.equals(other));
}
public void testExclusiveCollating() throws Exception {
Query query = new TermRangeQuery("content", "A", "C", false, false, Collator.getInstance(Locale.ENGLISH));
initializeIndex(new String[] {"A", "B", "C", "D"});
IndexSearcher searcher = new IndexSearcher(dir, true);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("A,B,C,D, only B in range", 1, hits.length);
searcher.close();
initializeIndex(new String[] {"A", "B", "D"});
searcher = new IndexSearcher(dir, true);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("A,B,D, only B in range", 1, hits.length);
searcher.close();
addDoc("C");
searcher = new IndexSearcher(dir, true);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("C added, still only B in range", 1, hits.length);
searcher.close();
}
public void testInclusiveCollating() throws Exception {
Query query = new TermRangeQuery("content", "A", "C",true, true, Collator.getInstance(Locale.ENGLISH));
initializeIndex(new String[]{"A", "B", "C", "D"});
IndexSearcher searcher = new IndexSearcher(dir, true);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("A,B,C,D - A,B,C in range", 3, hits.length);
searcher.close();
initializeIndex(new String[]{"A", "B", "D"});
searcher = new IndexSearcher(dir, true);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("A,B,D - A and B in range", 2, hits.length);
searcher.close();
addDoc("C");
searcher = new IndexSearcher(dir, true);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("C added - A, B, C in range", 3, hits.length);
searcher.close();
}
public void testFarsi() throws Exception {
// Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in
// RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
// characters properly.
Collator collator = Collator.getInstance(new Locale("ar"));
Query query = new TermRangeQuery("content", "\u062F", "\u0698", true, true, collator);
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// index Term below should NOT be returned by a TermRangeQuery with a Farsi
// Collator (or an Arabic one for the case when Farsi is not supported).
initializeIndex(new String[]{ "\u0633\u0627\u0628"});
IndexSearcher searcher = new IndexSearcher(dir, true);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, hits.length);
query = new TermRangeQuery("content", "\u0633", "\u0638",true, true, collator);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, hits.length);
searcher.close();
}
public void testDanish() throws Exception {
Collator collator = Collator.getInstance(new Locale("da", "dk"));
// Danish collation orders the words below in the given order (example taken
// from TestSort.testInternationalSort() ).
String[] words = { "H\u00D8T", "H\u00C5T", "MAND" };
Query query = new TermRangeQuery("content", "H\u00D8T", "MAND", false, false, collator);
// Unicode order would not include "H\u00C5T" in [ "H\u00D8T", "MAND" ],
// but Danish collation does.
initializeIndex(words);
IndexSearcher searcher = new IndexSearcher(dir, true);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, hits.length);
query = new TermRangeQuery("content", "H\u00C5T", "MAND", false, false, collator);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, hits.length);
searcher.close();
}
private static class SingleCharAnalyzer extends Analyzer {
@ -363,7 +275,7 @@ public class TestTermRangeQuery extends LuceneTestCase {
public void testExclusiveLowerNull() throws Exception {
Analyzer analyzer = new SingleCharAnalyzer();
//http://issues.apache.org/jira/browse/LUCENE-38
Query query = new TermRangeQuery("content", null, "C",
Query query = TermRangeQuery.newStringRange("content", null, "C",
false, false);
initializeIndex(new String[] {"A", "B", "", "C", "D"}, analyzer);
IndexSearcher searcher = new IndexSearcher(dir, true);
@ -396,7 +308,7 @@ public class TestTermRangeQuery extends LuceneTestCase {
public void testInclusiveLowerNull() throws Exception {
//http://issues.apache.org/jira/browse/LUCENE-38
Analyzer analyzer = new SingleCharAnalyzer();
Query query = new TermRangeQuery("content", null, "C", true, true);
Query query = TermRangeQuery.newStringRange("content", null, "C", true, true);
initializeIndex(new String[]{"A", "B", "","C", "D"}, analyzer);
IndexSearcher searcher = new IndexSearcher(dir, true);
int numHits = searcher.search(query, null, 1000).totalHits;

View File

@ -17,6 +17,10 @@ package org.apache.lucene.util;
* limitations under the License.
*/
/**
* @deprecated Remove when IndexableBinaryStringTools is removed.
*/
@Deprecated
public class TestIndexableBinaryStringTools extends LuceneTestCase {
private static final int NUM_RANDOM_TESTS = 2000 * RANDOM_MULTIPLIER;
private static final int MAX_RANDOM_BINARY_LENGTH = 300 * RANDOM_MULTIPLIER;

View File

@ -25,6 +25,10 @@ API Changes
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
can be generated. (Chris Harris via Steven Rowe)
* LUCENE-2514, LUCENE-2551: JDK and ICU CollationKeyAnalyzers were changed to
use pure byte keys when Version >= 4.0. This cuts sort key size approximately
in half. (Robert Muir)
New Features
* LUCENE-2413: Consolidated Solr analysis components into common.

View File

@ -29,8 +29,8 @@ import org.apache.lucene.util.AttributeSource;
* Emits the entire input as a single token.
*/
public final class KeywordTokenizer extends Tokenizer {
private static final int DEFAULT_BUFFER_SIZE = 256;
/** Default read buffer size */
public static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done = false;
private int finalOffset;

View File

@ -0,0 +1,103 @@
package org.apache.lucene.collation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.Collator;
import org.apache.lucene.collation.tokenattributes.CollatedTermAttributeImpl;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
/**
* <p>
* Converts each token into its {@link java.text.CollationKey}, and then
* encodes the bytes as an index term.
* </p>
* <p>
* <strong>WARNING:</strong> Make sure you use exactly the same Collator at
* index and query time -- CollationKeys are only comparable when produced by
* the same Collator. Since {@link java.text.RuleBasedCollator}s are not
* independently versioned, it is unsafe to search against stored
* CollationKeys unless the following are exactly the same (best practice is
* to store this information with the index and check that they remain the
* same at query time):
* </p>
* <ol>
* <li>JVM vendor</li>
* <li>JVM version, including patch version</li>
* <li>
* The language (and country and variant, if specified) of the Locale
* used when constructing the collator via
* {@link Collator#getInstance(java.util.Locale)}.
* </li>
* <li>
* The collation strength used - see {@link Collator#setStrength(int)}
* </li>
* </ol>
* <p>
* The <code>ICUCollationAttributeFactory</code> in the icu package of Lucene's
* contrib area uses ICU4J's Collator, which makes its
* version available, thus allowing collation to be versioned independently
* from the JVM. ICUCollationAttributeFactory is also significantly faster and
* generates significantly shorter keys than CollationAttributeFactory. See
* <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
* >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
* <p>
* CollationKeys generated by java.text.Collators are not compatible
* with those generated by ICU Collators. Specifically, if you use
* CollationAttributeFactory to generate index terms, do not use
* ICUCollationAttributeFactory on the query side, or vice versa.
* </p>
*/
public class CollationAttributeFactory extends AttributeSource.AttributeFactory {
private final Collator collator;
private final AttributeSource.AttributeFactory delegate;
/**
* Create a CollationAttributeFactory, using
* {@link AttributeSource.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY} as the
* factory for all other attributes.
* @param collator CollationKey generator
*/
public CollationAttributeFactory(Collator collator) {
this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator);
}
/**
* Create a CollationAttributeFactory, using the supplied Attribute Factory
* as the factory for all other attributes.
* @param delegate Attribute Factory
* @param collator CollationKey generator
*/
public CollationAttributeFactory(AttributeSource.AttributeFactory delegate, Collator collator) {
this.delegate = delegate;
this.collator = collator;
}
@Override
public AttributeImpl createAttributeInstance(
Class<? extends Attribute> attClass) {
return attClass.isAssignableFrom(CollatedTermAttributeImpl.class)
? new CollatedTermAttributeImpl(collator)
: delegate.createAttributeInstance(attClass);
}
}
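A brief usage sketch (mirroring what the 4.0 path of CollationKeyAnalyzer does internally; the variable names are illustrative): pass the factory to a KeywordTokenizer so its term attribute emits collation keys instead of UTF-8 bytes.
Collator collator = Collator.getInstance(Locale.FRANCE);
CollationAttributeFactory factory = new CollationAttributeFactory(collator);
Reader reader = new StringReader("p\u00E8re");
// Each token's index bytes are now produced by CollatedTermAttributeImpl.toBytesRef().
KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);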

View File

@ -18,14 +18,13 @@ package org.apache.lucene.collation;
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link
import org.apache.lucene.util.Version;
import java.text.Collator;
import java.io.Reader;
import java.io.IOException;
/**
* <p>
@ -33,8 +32,8 @@ import java.io.IOException;
* </p>
* <p>
* Converts the token into its {@link java.text.CollationKey}, and then
* encodes the CollationKey with
* {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow
* encodes the CollationKey either directly or with
* {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow
* it to be stored as an index term.
* </p>
* <p>
@ -75,39 +74,49 @@ import java.io.IOException;
* CollationKeyAnalyzer to generate index terms, do not use
* ICUCollationKeyAnalyzer on the query side, or vice versa.
* </p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating CollationKeyAnalyzer:
* <ul>
* <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
* versions will encode the bytes with {@link IndexableBinaryStringTools}.
* </ul>
*/
public final class CollationKeyAnalyzer extends Analyzer {
private Collator collator;
public CollationKeyAnalyzer(Collator collator) {
public final class CollationKeyAnalyzer extends ReusableAnalyzerBase {
private final Collator collator;
private final CollationAttributeFactory factory;
private final Version matchVersion;
/**
* Create a new CollationKeyAnalyzer, using the specified collator.
*
* @param matchVersion See <a href="#version">above</a>
* @param collator CollationKey generator
*/
public CollationKeyAnalyzer(Version matchVersion, Collator collator) {
this.matchVersion = matchVersion;
this.collator = collator;
this.factory = new CollationAttributeFactory(collator);
}
/**
* @deprecated Use {@link CollationKeyAnalyzer#CollationKeyAnalyzer(Version, Collator)}
* and specify a version instead. This ctor will be removed in Lucene 5.0
*/
@Deprecated
public CollationKeyAnalyzer(Collator collator) {
this(Version.LUCENE_31, collator);
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new KeywordTokenizer(reader);
result = new CollationKeyFilter(result, collator);
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
}
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams)getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new KeywordTokenizer(reader);
streams.result = new CollationKeyFilter(streams.source, collator);
setPreviousTokenStream(streams);
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_40)) {
KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
return new TokenStreamComponents(tokenizer, tokenizer);
} else {
streams.source.reset(reader);
KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new CollationKeyFilter(tokenizer, collator));
}
return streams.result;
}
}
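The matchVersion branch above implies the same Version (and the same Collator) must be used at index and query time; a hedged illustration of the incompatibility:
Collator collator = Collator.getInstance(Locale.US);
// 4.0+: index terms are the raw collation key bytes.
Analyzer current = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
// Pre-4.0: keys are char-encoded via CollationKeyFilter and IndexableBinaryStringTools.
// The two encodings are not interchangeable, so changing the Version means reindexing.
Analyzer legacy = new CollationKeyAnalyzer(Version.LUCENE_31, collator);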

View File

@ -71,7 +71,10 @@ import java.text.Collator;
* CollationKeyFilter to generate index terms, do not use
* ICUCollationKeyFilter on the query side, or vice versa.
* </p>
* @deprecated Use {@link CollationAttributeFactory} instead, which encodes
* terms directly as bytes. This filter will be removed in Lucene 5.0
*/
@Deprecated
public final class CollationKeyFilter extends TokenFilter {
private final Collator collator;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

View File

@ -55,10 +55,9 @@
<code><pre>
// "fa" Locale is not supported by Sun JDK 1.4 or 1.5
Collator collator = Collator.getInstance(new Locale("ar"));
CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628",
Field.Store.YES, Field.Index.ANALYZED));
@ -66,12 +65,9 @@
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
// The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
// to be passed through an analyzer - Lucene's standard QueryParser does not
// allow this.
AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
aqp.setLowercaseExpandedTerms(false);
QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
aqp.setAnalyzeRangeTerms(true);
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
@ -85,10 +81,9 @@
<h3>Danish Sorting</h3>
<code><pre>
Analyzer analyzer
= new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
= new CollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new Locale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
IndexWriter writer = new IndexWriter
(indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@ -99,7 +94,7 @@
writer.addDocument(doc);
}
writer.close();
Searcher searcher = new IndexSearcher(indexStore, true);
IndexSearcher searcher = new IndexSearcher(indexStore, true);
Sort sort = new Sort();
sort.setSort(new SortField("contents", SortField.STRING));
Query query = new MatchAllDocsQuery();
@ -114,16 +109,15 @@
<code><pre>
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
Analyzer analyzer = new CollationKeyAnalyzer(collator);
Analyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
QueryParser parser = new QueryParser("contents", analyzer);
QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);

View File

@ -1,4 +1,4 @@
package org.apache.lucene.queryParser.standard.config;
package org.apache.lucene.collation.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -19,19 +19,30 @@ package org.apache.lucene.queryParser.standard.config;
import java.text.Collator;
import org.apache.lucene.queryParser.core.config.QueryConfigHandler;
import org.apache.lucene.queryParser.standard.processors.ParametricRangeQueryNodeProcessor;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.util.BytesRef;
/**
* This attribute is used by {@link ParametricRangeQueryNodeProcessor} processor
* and must be defined in the {@link QueryConfigHandler}. This attribute tells
* the processor which {@link Collator} should be used for a
* {@link TermRangeQuery} <br/>
*
* Extension of {@link CharTermAttributeImpl} that encodes the term
* text as a binary Unicode collation key instead of as UTF-8 bytes.
*/
public interface RangeCollatorAttribute extends Attribute {
public void setDateResolution(Collator rangeCollator);
public Collator getRangeCollator();
public class CollatedTermAttributeImpl extends CharTermAttributeImpl {
private final Collator collator;
/**
* Create a new CollatedTermAttributeImpl
* @param collator Collation key generator
*/
public CollatedTermAttributeImpl(Collator collator) {
this.collator = collator;
}
@Override
public int toBytesRef(BytesRef target) {
target.bytes = collator.getCollationKey(toString()).toByteArray();
target.offset = 0;
target.length = target.bytes.length;
return target.hashCode();
}
}
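A small illustrative sketch of the override's effect: after toBytesRef, the target holds the collator's sort key, so plain byte-order comparisons follow the collation order instead of UTF-8 order.
Collator collator = Collator.getInstance(new Locale("da", "dk"));
CollatedTermAttributeImpl att = new CollatedTermAttributeImpl(collator);
att.append("H\u00C5T");
BytesRef target = new BytesRef();
att.toBytesRef(target);
// target now holds collator.getCollationKey("H\u00C5T").toByteArray()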

View File

@ -36,6 +36,7 @@ import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IndexableBinaryStringTools;
import org.apache.lucene.util.LuceneTestCase;
@ -56,7 +57,9 @@ public abstract class CollationTestBase extends LuceneTestCase {
* @param keyBits the result from
* collator.getCollationKey(original).toByteArray()
* @return The encoded collation key for the original String
* @deprecated only for testing deprecated filters
*/
@Deprecated
protected String encodeCollationKey(byte[] keyBits) {
// Ensure that the backing char[] array is large enough to hold the encoded
// Binary String
@ -65,10 +68,10 @@ public abstract class CollationTestBase extends LuceneTestCase {
IndexableBinaryStringTools.encode(keyBits, 0, keyBits.length, encodedBegArray, 0, encodedLength);
return new String(encodedBegArray);
}
public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg,
String firstEnd, String secondBeg,
String secondEnd) throws Exception {
public void testFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg,
BytesRef firstEnd, BytesRef secondBeg,
BytesRef secondEnd) throws Exception {
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer));
@ -98,9 +101,9 @@ public abstract class CollationTestBase extends LuceneTestCase {
searcher.close();
}
public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg,
String firstEnd, String secondBeg,
String secondEnd) throws Exception {
public void testFarsiRangeQueryCollating(Analyzer analyzer, BytesRef firstBeg,
BytesRef firstEnd, BytesRef secondBeg,
BytesRef secondEnd) throws Exception {
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer));
@ -126,8 +129,8 @@ public abstract class CollationTestBase extends LuceneTestCase {
searcher.close();
}
public void testFarsiTermRangeQuery(Analyzer analyzer, String firstBeg,
String firstEnd, String secondBeg, String secondEnd) throws Exception {
public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg,
BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception {
RAMDirectory farsiIndex = new RAMDirectory();
IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(

View File

@ -19,6 +19,8 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
@ -34,17 +36,19 @@ public class TestCollationKeyAnalyzer extends CollationTestBase {
// RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
// characters properly.
private Collator collator = Collator.getInstance(new Locale("ar"));
private Analyzer analyzer = new CollationKeyAnalyzer(collator);
private Analyzer analyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
private String firstRangeBeginning = encodeCollationKey
(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
private String firstRangeEnd = encodeCollationKey
(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
private String secondRangeBeginning = encodeCollationKey
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
private String secondRangeEnd = encodeCollationKey
(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
private BytesRef firstRangeBeginning = new BytesRef(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
private BytesRef firstRangeEnd = new BytesRef(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
private BytesRef secondRangeBeginning = new BytesRef(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
private BytesRef secondRangeEnd = new BytesRef(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
@Override
public void setUp() throws Exception {
super.setUp();
assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
}
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating
(analyzer, firstRangeBeginning, firstRangeEnd,
@ -65,13 +69,13 @@ public class TestCollationKeyAnalyzer extends CollationTestBase {
public void testCollationKeySort() throws Exception {
Analyzer usAnalyzer
= new CollationKeyAnalyzer(Collator.getInstance(Locale.US));
= new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
Analyzer franceAnalyzer
= new CollationKeyAnalyzer(Collator.getInstance(Locale.FRANCE));
= new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
Analyzer swedenAnalyzer
= new CollationKeyAnalyzer(Collator.getInstance(new Locale("sv", "se")));
= new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
Analyzer denmarkAnalyzer
= new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
= new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
// The ICU Collator and Sun java.text.Collator implementations differ in their
// orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US.

View File

@ -21,12 +21,16 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
import java.io.Reader;
/**
* @deprecated remove when CollationKeyFilter is removed.
*/
@Deprecated
public class TestCollationKeyFilter extends CollationTestBase {
// the sort order of Ø versus U depends on the version of the rules being used
// for the inherited root locale: Ø's order isn't specified in Locale.US since
@ -39,14 +43,14 @@ public class TestCollationKeyFilter extends CollationTestBase {
private Collator collator = Collator.getInstance(new Locale("ar"));
private Analyzer analyzer = new TestAnalyzer(collator);
private String firstRangeBeginning = encodeCollationKey
(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
private String firstRangeEnd = encodeCollationKey
(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
private String secondRangeBeginning = encodeCollationKey
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
private String secondRangeEnd = encodeCollationKey
(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
(collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
(collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
public final class TestAnalyzer extends Analyzer {

View File

@ -0,0 +1,96 @@
package org.apache.lucene.collation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.collation.tokenattributes.ICUCollatedTermAttributeImpl;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
import com.ibm.icu.text.Collator;
/**
* <p>
* Converts each token into its {@link com.ibm.icu.text.CollationKey}, and
* then encodes the bytes as an index term.
* </p>
* <p>
* <strong>WARNING:</strong> Make sure you use exactly the same Collator at
* index and query time -- CollationKeys are only comparable when produced by
* the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
* independently versioned, so it is safe to search against stored
* CollationKeys if the following are exactly the same (best practice is
* to store this information with the index and check that they remain the
* same at query time):
* </p>
* <ol>
* <li>
* Collator version - see {@link Collator#getVersion()}
* </li>
* <li>
* The collation strength used - see {@link Collator#setStrength(int)}
* </li>
* </ol>
* <p>
* CollationKeys generated by ICU Collators are not compatible with those
* generated by java.text.Collators. Specifically, if you use
* ICUCollationAttributeFactory to generate index terms, do not use
* {@link CollationAttributeFactory} on the query side, or vice versa.
* </p>
* <p>
* ICUCollationAttributeFactory is significantly faster and generates significantly
* shorter keys than CollationAttributeFactory. See
* <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
* >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
*/
public class ICUCollationAttributeFactory extends AttributeSource.AttributeFactory {
private final Collator collator;
private final AttributeSource.AttributeFactory delegate;
/**
* Create an ICUCollationAttributeFactory, using
* {@link AttributeSource.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY} as the
* factory for all other attributes.
* @param collator CollationKey generator
*/
public ICUCollationAttributeFactory(Collator collator) {
this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator);
}
/**
* Create an ICUCollationAttributeFactory, using the supplied Attribute
* Factory as the factory for all other attributes.
* @param delegate Attribute Factory
* @param collator CollationKey generator
*/
public ICUCollationAttributeFactory(AttributeSource.AttributeFactory delegate, Collator collator) {
this.delegate = delegate;
this.collator = collator;
}
@Override
public AttributeImpl createAttributeInstance(
Class<? extends Attribute> attClass) {
return attClass.isAssignableFrom(ICUCollatedTermAttributeImpl.class)
? new ICUCollatedTermAttributeImpl(collator)
: delegate.createAttributeInstance(attClass);
}
}
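As with the JDK-based factory, a short sketch (illustrative names; ICU4J imports assumed) of plugging it into a tokenizer:
Collator collator = Collator.getInstance(new ULocale("fa"));
ICUCollationAttributeFactory factory = new ICUCollationAttributeFactory(collator);
Reader reader = new StringReader("\u0633\u0627\u0628");
KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);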

View File

@ -19,24 +19,20 @@ package org.apache.lucene.collation;
import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.collation.CollationKeyAnalyzer; // javadocs
import org.apache.lucene.util.Version;
import java.io.Reader;
import java.io.IOException;
/**
* <p>
* Filters {@link KeywordTokenizer} with {@link ICUCollationKeyFilter}.
* <p>
* Converts the token into its {@link com.ibm.icu.text.CollationKey}, and
* then encodes the CollationKey with
* {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to
* then encodes the CollationKey either directly or with
* {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow it to
* be stored as an index term.
* </p>
* <p>
@ -70,39 +66,48 @@ import java.io.IOException;
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating ICUCollationKeyAnalyzer:
* <ul>
* <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
* versions will encode the bytes with {@link IndexableBinaryStringTools}.
* </ul>
*/
public final class ICUCollationKeyAnalyzer extends Analyzer {
private Collator collator;
public final class ICUCollationKeyAnalyzer extends ReusableAnalyzerBase {
private final Collator collator;
private final ICUCollationAttributeFactory factory;
private final Version matchVersion;
public ICUCollationKeyAnalyzer(Collator collator) {
/**
* Create a new ICUCollationKeyAnalyzer, using the specified collator.
*
* @param matchVersion See <a href="#version">above</a>
* @param collator CollationKey generator
*/
public ICUCollationKeyAnalyzer(Version matchVersion, Collator collator) {
this.matchVersion = matchVersion;
this.collator = collator;
this.factory = new ICUCollationAttributeFactory(collator);
}
/**
* @deprecated Use {@link ICUCollationKeyAnalyzer#ICUCollationKeyAnalyzer(Version, Collator)}
* and specify a version instead. This ctor will be removed in Lucene 5.0
*/
@Deprecated
public ICUCollationKeyAnalyzer(Collator collator) {
this(Version.LUCENE_31, collator);
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new KeywordTokenizer(reader);
result = new ICUCollationKeyFilter(result, collator);
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
}
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams)getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new KeywordTokenizer(reader);
streams.result = new ICUCollationKeyFilter(streams.source, collator);
setPreviousTokenStream(streams);
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_40)) {
KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
return new TokenStreamComponents(tokenizer, tokenizer);
} else {
streams.source.reset(reader);
KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new ICUCollationKeyFilter(tokenizer, collator));
}
return streams.result;
}
}

View File

@ -68,7 +68,10 @@ import java.io.IOException;
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
* @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
* terms directly as bytes. This filter will be removed in Lucene 5.0
*/
@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
private Collator collator = null;
private RawCollationKey reusableKey = new RawCollationKey();

View File

@ -0,0 +1,50 @@
package org.apache.lucene.collation.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.util.BytesRef;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;
/**
* Extension of {@link CharTermAttributeImpl} that encodes the term
* text as a binary Unicode collation key instead of as UTF-8 bytes.
*/
public class ICUCollatedTermAttributeImpl extends CharTermAttributeImpl {
private final Collator collator;
private final RawCollationKey key = new RawCollationKey();
/**
* Create a new ICUCollatedTermAttributeImpl
* @param collator Collation key generator
*/
public ICUCollatedTermAttributeImpl(Collator collator) {
this.collator = collator;
}
@Override
public int toBytesRef(BytesRef target) {
collator.getRawCollationKey(toString(), key);
target.bytes = key.bytes;
target.offset = 0;
target.length = key.size;
return target.hashCode();
}
}
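A sketch of the allocation difference (illustrative only): getRawCollationKey fills the reusable key in place, so unlike java.text.CollationKey.toByteArray() there is no fresh byte[] per token.
Collator collator = Collator.getInstance(new ULocale("tr", "TR"));
ICUCollatedTermAttributeImpl att = new ICUCollatedTermAttributeImpl(collator);
BytesRef target = new BytesRef();
for (String term : new String[] { "DIGY", "d\u0131gy" }) {
  att.setEmpty().append(term);
  att.toBytesRef(target); // the same RawCollationKey instance is reused on each call
}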

View File

@ -112,11 +112,10 @@ algorithm.
<h3>Farsi Range Queries</h3>
<code><pre>
Collator collator = Collator.getInstance(new Locale("ar"));
ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
Collator collator = Collator.getInstance(new ULocale("ar"));
ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628",
Field.Store.YES, Field.Index.ANALYZED));
@ -124,12 +123,9 @@ algorithm.
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
// The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
// to be passed through an analyzer - Lucene's standard QueryParser does not
// allow this.
AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
aqp.setLowercaseExpandedTerms(false);
QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
aqp.setAnalyzeRangeTerms(true);
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
@ -143,10 +139,9 @@ algorithm.
<h3>Danish Sorting</h3>
<code><pre>
Analyzer analyzer
= new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
= new ICUCollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new ULocale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
IndexWriter writer = new IndexWriter
(indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@ -157,7 +152,7 @@ algorithm.
writer.addDocument(doc);
}
writer.close();
Searcher searcher = new IndexSearcher(indexStore, true);
IndexSearcher searcher = new IndexSearcher(indexStore, true);
Sort sort = new Sort();
sort.setSort(new SortField("contents", SortField.STRING));
Query query = new MatchAllDocsQuery();
@ -170,18 +165,17 @@ algorithm.
<h3>Turkish Case Normalization</h3>
<code><pre>
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
Collator collator = Collator.getInstance(new ULocale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
Analyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
QueryParser parser = new QueryParser("contents", analyzer);
QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);

View File

@ -20,6 +20,8 @@ package org.apache.lucene.collation;
import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.util.BytesRef;
import java.util.Locale;
@ -27,17 +29,23 @@ import java.util.Locale;
public class TestICUCollationKeyAnalyzer extends CollationTestBase {
private Collator collator = Collator.getInstance(new Locale("fa"));
private Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
private Analyzer analyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
private String firstRangeBeginning = encodeCollationKey
private BytesRef firstRangeBeginning = new BytesRef
(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
private String firstRangeEnd = encodeCollationKey
private BytesRef firstRangeEnd = new BytesRef
(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
private String secondRangeBeginning = encodeCollationKey
private BytesRef secondRangeBeginning = new BytesRef
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
private String secondRangeEnd = encodeCollationKey
private BytesRef secondRangeEnd = new BytesRef
(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
@Override
public void setUp() throws Exception {
super.setUp();
assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
}
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);
@ -62,13 +70,13 @@ public class TestICUCollationKeyAnalyzer extends CollationTestBase {
//
public void testCollationKeySort() throws Exception {
Analyzer usAnalyzer = new ICUCollationKeyAnalyzer
(Collator.getInstance(Locale.US));
(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer
(Collator.getInstance(Locale.FRANCE));
(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer
(Collator.getInstance(new Locale("sv", "se")));
(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
Analyzer denmarkAnalyzer = new ICUCollationKeyAnalyzer
(Collator.getInstance(new Locale("da", "dk")));
(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
// The ICU Collator and java.text.Collator implementations differ in their
// orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US.

View File

@ -22,24 +22,26 @@ import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.BytesRef;
import java.io.Reader;
import java.util.Locale;
/** @deprecated remove this when ICUCollationKeyFilter is removed */
@Deprecated
public class TestICUCollationKeyFilter extends CollationTestBase {
private Collator collator = Collator.getInstance(new Locale("fa"));
private Analyzer analyzer = new TestAnalyzer(collator);
private String firstRangeBeginning = encodeCollationKey
(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
private String firstRangeEnd = encodeCollationKey
(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
private String secondRangeBeginning = encodeCollationKey
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
private String secondRangeEnd = encodeCollationKey
(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
(collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
(collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
public final class TestAnalyzer extends Analyzer {

View File

@ -95,10 +95,10 @@ class ShardFieldSortedHitQueue extends PriorityQueue {
String fieldname = fields[i].getField();
comparators[i] = getCachedComparator(fieldname, fields[i]
.getType(), fields[i].getLocale(), fields[i].getComparatorSource());
.getType(), fields[i].getComparatorSource());
if (fields[i].getType() == SortField.STRING) {
this.fields[i] = new SortField(fieldname, fields[i].getLocale(),
this.fields[i] = new SortField(fieldname, SortField.STRING,
fields[i].getReverse());
} else {
this.fields[i] = new SortField(fieldname, fields[i].getType(),
@ -145,17 +145,14 @@ class ShardFieldSortedHitQueue extends PriorityQueue {
return c < 0;
}
Comparator getCachedComparator(String fieldname, int type, Locale locale, FieldComparatorSource factory) {
Comparator getCachedComparator(String fieldname, int type, FieldComparatorSource factory) {
Comparator comparator = null;
switch (type) {
case SortField.SCORE:
comparator = comparatorScore(fieldname);
break;
case SortField.STRING:
if (locale != null)
comparator = comparatorStringLocale(fieldname, locale);
else
comparator = comparatorNatural(fieldname);
comparator = comparatorNatural(fieldname);
break;
case SortField.CUSTOM:
if (factory instanceof MissingStringLastComparatorSource){

View File

@ -410,7 +410,7 @@ public class DateField extends FieldType {
/** DateField specific range query */
public Query getRangeQuery(QParser parser, SchemaField sf, Date part1, Date part2, boolean minInclusive, boolean maxInclusive) {
return new TermRangeQuery(
return TermRangeQuery.newStringRange(
sf.getName(),
part1 == null ? null : toInternal(part1),
part2 == null ? null : toInternal(part2),

View File

@ -521,7 +521,7 @@ public abstract class FieldType extends FieldProperties {
*/
public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
// constant score mode is now enabled per default
return new TermRangeQuery(
return TermRangeQuery.newStringRange(
field.getName(),
part1 == null ? null : toInternal(part1),
part2 == null ? null : toInternal(part2),

View File

@ -398,8 +398,8 @@ public class QueryParsing {
String fname = q.getField();
FieldType ft = writeFieldName(fname, schema, out, flags);
out.append(q.includesLower() ? '[' : '{');
String lt = q.getLowerTerm();
String ut = q.getUpperTerm();
String lt = q.getLowerTerm().utf8ToString();
String ut = q.getUpperTerm().utf8ToString();
if (lt == null) {
out.append('*');
} else {