LUCENE-6281: Removed slow collation support.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1661720 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2015-02-23 17:32:53 +00:00
parent a43ebd6870
commit 0c078aaf4d
6 changed files with 4 additions and 684 deletions

View File

@ -152,6 +152,10 @@ API Changes
* LUCENE-6272: Scorer extends DocIdSetIterator rather than DocsEnum (Alan
Woodward)
* LUCENE-6281: Removed support for slow collations from lucene/sandbox. Better
performance would be achieved through CollationKeyAnalyzer or
ICUCollationKeyAnalyzer. (Adrien Grand)
Other
* LUCENE-6248: Remove unused odd constants from StandardSyntaxParser.jj

View File

@ -1,141 +0,0 @@
package org.apache.lucene.sandbox.queries;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.text.Collator;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.search.SimpleFieldComparator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
/**
 * Field comparator that orders hits by the string value of a binary doc-values
 * field, comparing the decoded strings with a caller-supplied {@link Collator}.
 *
 * <p>A document is treated as having no value (represented as {@code null})
 * when its term is empty and the field's docs-with-field bits are not set for
 * it. {@code null} sorts before every non-null value.
 *
 * <p><b>WARNING</b>: this is very slow; you'll
 * get much better performance using the
 * CollationKeyAnalyzer or ICUCollationKeyAnalyzer.
 * @deprecated Index collation keys with CollationKeyAnalyzer or ICUCollationKeyAnalyzer instead.
 * This class will be removed in Lucene 5.0
 */
@Deprecated
public final class SlowCollatedStringComparator extends SimpleFieldComparator<String> {

  /** Decoded value held in each competitive slot; {@code null} marks a missing value. */
  private final String[] values;
  /** Per-segment doc values for {@link #field}; replaced in {@link #doSetNextReader}. */
  private BinaryDocValues currentDocTerms;
  /** Per-segment bits telling which documents have a value for {@link #field}. */
  private Bits docsWithField;
  private final String field;
  final Collator collator;
  /** Value of the current bottom slot, cached by {@link #setBottom(int)}. */
  private String bottom;
  /** Value supplied through {@link #setTopValue(String)}. */
  private String topValue;

  /**
   * @param numHits number of slots to allocate for competitive values
   * @param field name of the doc-values field to read
   * @param collator collator used for every comparison of two non-null values
   */
  public SlowCollatedStringComparator(int numHits, String field, Collator collator) {
    this.values = new String[numHits];
    this.field = field;
    this.collator = collator;
  }

  /** Compares the values stored in two slots; {@code null} sorts first. */
  @Override
  public int compare(int slot1, int slot2) {
    return compareValues(values[slot1], values[slot2]);
  }

  /** Compares the cached bottom value against the value of {@code doc}. */
  @Override
  public int compareBottom(int doc) {
    return compareValues(bottom, readValue(doc));
  }

  /** Stores the value of {@code doc} (or {@code null} when missing) into {@code slot}. */
  @Override
  public void copy(int slot, int doc) {
    values[slot] = readValue(doc);
  }

  /** Loads the binary doc values and docs-with-field bits of the next segment. */
  @Override
  protected void doSetNextReader(LeafReaderContext context) throws IOException {
    currentDocTerms = DocValues.getBinary(context.reader(), field);
    docsWithField = DocValues.getDocsWithField(context.reader(), field);
  }

  /** Caches the value of slot {@code bottom} for later {@link #compareBottom(int)} calls. */
  @Override
  public void setBottom(final int bottom) {
    this.bottom = values[bottom];
  }

  @Override
  public void setTopValue(final String value) {
    this.topValue = value;
  }

  @Override
  public String value(int slot) {
    return values[slot];
  }

  /**
   * Orders two possibly-null values: two nulls are equal, a single null sorts
   * before any non-null value, and two non-null values are delegated to the
   * collator.
   */
  @Override
  public int compareValues(String first, String second) {
    if (first == null) {
      return second == null ? 0 : -1;
    }
    if (second == null) {
      return 1;
    }
    return collator.compare(first, second);
  }

  /** Compares the top value against the value of {@code doc}. */
  @Override
  public int compareTop(int doc) {
    return compareValues(topValue, readValue(doc));
  }

  /**
   * Decodes the current segment's value for {@code doc}.
   *
   * @return the UTF-8 decoded term, or {@code null} when the term is empty and
   *         the docs-with-field bits are not set for the document
   */
  private String readValue(int doc) {
    final BytesRef term = currentDocTerms.get(doc);
    final boolean missing = term.length == 0 && docsWithField.get(doc) == false;
    return missing ? null : term.utf8ToString();
  }
}

View File

@ -1,75 +0,0 @@
package org.apache.lucene.sandbox.queries;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.Collator;
import org.apache.lucene.search.DocValuesRangeQuery;
import org.apache.lucene.search.MultiTermQueryWrapperFilter;
import org.apache.lucene.search.NumericRangeFilter; // javadoc
// javadoc
/**
 * A Filter that restricts search results to a range of term
 * values in a given field, with range membership decided by a {@link Collator}.
 *
 * <p>This class is a thin wrapper: the constructor builds a
 * {@link SlowCollatedTermRangeQuery} from its arguments and every accessor
 * delegates to that query. It is not intended
 * for numerical ranges; use {@link NumericRangeFilter} instead.
 *
 * <p>If you construct a large number of range filters with different ranges but on the
 * same field, {@link DocValuesRangeQuery} may have significantly better performance.
 * @deprecated Index collation keys with CollationKeyAnalyzer or ICUCollationKeyAnalyzer instead.
 * This class will be removed in Lucene 5.0
 */
@Deprecated
public class SlowCollatedTermRangeFilter extends MultiTermQueryWrapperFilter<SlowCollatedTermRangeQuery> {
  /**
   * Creates a filter over the given collated term range.
   *
   * <p>This constructor performs no validation of its own; the arguments are
   * handed unchanged to {@link SlowCollatedTermRangeQuery}.
   *
   * @param fieldName The field whose terms are filtered
   * @param lowerTerm The lower bound on this range, or null for an open lower end
   * @param upperTerm The upper bound on this range, or null for an open upper end
   * @param includeLower Does this range include the lower bound?
   * @param includeUpper Does this range include the upper bound?
   * @param collator The collator to use when determining range inclusion. The
   *  wrapped query calls it without a null check as soon as at least one bound
   *  is non-null, so a null collator is only usable for a fully open range.
   */
  public SlowCollatedTermRangeFilter(String fieldName, String lowerTerm, String upperTerm,
                                     boolean includeLower, boolean includeUpper,
                                     Collator collator) {
    super(new SlowCollatedTermRangeQuery(fieldName, lowerTerm, upperTerm, includeLower, includeUpper, collator));
  }

  /** Returns the lower value of this range filter */
  public String getLowerTerm() { return query.getLowerTerm(); }

  /** Returns the upper value of this range filter */
  public String getUpperTerm() { return query.getUpperTerm(); }

  /** Returns <code>true</code> if the lower endpoint is inclusive */
  public boolean includesLower() { return query.includesLower(); }

  /** Returns <code>true</code> if the upper endpoint is inclusive */
  public boolean includesUpper() { return query.includesUpper(); }

  /** Returns the collator used to determine range inclusion, if any. */
  public Collator getCollator() { return query.getCollator(); }
}

View File

@ -1,178 +0,0 @@
package org.apache.lucene.sandbox.queries;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.text.Collator;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery; // javadoc
import org.apache.lucene.search.NumericRangeQuery; // javadoc
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ToStringUtils;
/**
 * A Query that matches documents containing a term within a range, where the
 * range bounds are compared against each indexed term with a {@link Collator}.
 *
 * <p>It is not intended for numerical ranges; use {@link NumericRangeQuery} instead.
 *
 * <p>This class does not choose a rewrite method itself, so whatever
 * {@link MultiTermQuery} defaults to applies (historically documented here as
 * {@link MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}; confirm against the
 * MultiTermQuery in use).
 * @deprecated Index collation keys with CollationKeyAnalyzer or ICUCollationKeyAnalyzer instead.
 * This class will be removed in Lucene 5.0
 */
@Deprecated
public class SlowCollatedTermRangeQuery extends MultiTermQuery {
  private String lowerTerm;
  private String upperTerm;
  private boolean includeLower;
  private boolean includeUpper;
  private Collator collator;

  /**
   * Constructs a query selecting all terms greater/equal than
   * <code>lowerTerm</code> but less/equal than <code>upperTerm</code>.
   * <p>
   * A null endpoint is "open". Either or both endpoints may be open. The
   * arguments are stored as given; nothing is validated here.
   *
   * @param field The field whose terms are matched
   * @param lowerTerm The Term text at the lower end of the range
   * @param upperTerm The Term text at the upper end of the range
   * @param includeLower
   *          If true, the <code>lowerTerm</code> is
   *          included in the range.
   * @param includeUpper
   *          If true, the <code>upperTerm</code> is
   *          included in the range.
   * @param collator The collator to use to collate index Terms, to determine
   *  their membership in the range bounded by <code>lowerTerm</code> and
   *  <code>upperTerm</code>. It is dereferenced without a null check once at
   *  least one endpoint is non-null.
   */
  public SlowCollatedTermRangeQuery(String field, String lowerTerm, String upperTerm,
      boolean includeLower, boolean includeUpper, Collator collator) {
    super(field);
    this.collator = collator;
    this.lowerTerm = lowerTerm;
    this.includeLower = includeLower;
    this.upperTerm = upperTerm;
    this.includeUpper = includeUpper;
  }

  /** Returns the lower value of this range query */
  public String getLowerTerm() {
    return lowerTerm;
  }

  /** Returns the upper value of this range query */
  public String getUpperTerm() {
    return upperTerm;
  }

  /** Returns <code>true</code> if the lower endpoint is inclusive */
  public boolean includesLower() {
    return includeLower;
  }

  /** Returns <code>true</code> if the upper endpoint is inclusive */
  public boolean includesUpper() {
    return includeUpper;
  }

  /** Returns the collator used to determine range inclusion */
  public Collator getCollator() {
    return collator;
  }

  /**
   * Picks the terms enumeration for this range: an empty enumeration when both
   * bounds are set and the lower one collates after the upper one, the plain
   * term iterator when both bounds are open, and a collating filter otherwise.
   */
  @Override
  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    final boolean hasLower = lowerTerm != null;
    final boolean hasUpper = upperTerm != null;
    if (hasLower && hasUpper && collator.compare(lowerTerm, upperTerm) > 0) {
      // Inverted range: nothing can match.
      return TermsEnum.EMPTY;
    }

    final TermsEnum allTerms = terms.iterator(null);
    if (hasLower || hasUpper) {
      return new SlowCollatedTermRangeTermsEnum(allTerms,
          lowerTerm, upperTerm, includeLower, includeUpper, collator);
    }
    // Fully open range: every term matches, no filtering required.
    return allTerms;
  }

  /** @deprecated Use {@link #getField()} instead. */
  @Deprecated
  public String field() {
    return getField();
  }

  /** Prints a user-readable version of this query. */
  @Override
  public String toString(String field) {
    final StringBuilder sb = new StringBuilder();
    // The field prefix is only printed when it differs from the default field.
    if (!getField().equals(field)) {
      sb.append(getField()).append(":");
    }
    sb.append(includeLower ? '[' : '{')
      .append(lowerTerm == null ? "*" : lowerTerm)
      .append(" TO ")
      .append(upperTerm == null ? "*" : upperTerm)
      .append(includeUpper ? ']' : '}')
      .append(ToStringUtils.boost(getBoost()));
    return sb.toString();
  }

  @Override
  public int hashCode() {
    int h = super.hashCode();
    h = 31 * h + hashOrZero(collator);
    h = 31 * h + (includeLower ? 1231 : 1237);
    h = 31 * h + (includeUpper ? 1231 : 1237);
    h = 31 * h + hashOrZero(lowerTerm);
    h = 31 * h + hashOrZero(upperTerm);
    return h;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!super.equals(obj)) {
      return false;
    }
    if (getClass() != obj.getClass()) {
      return false;
    }
    final SlowCollatedTermRangeQuery that = (SlowCollatedTermRangeQuery) obj;
    return sameOrBothNull(collator, that.collator)
        && includeLower == that.includeLower
        && includeUpper == that.includeUpper
        && sameOrBothNull(lowerTerm, that.lowerTerm)
        && sameOrBothNull(upperTerm, that.upperTerm);
  }

  /** Hash code of {@code o}, or 0 when {@code o} is null. */
  private static int hashOrZero(Object o) {
    return o == null ? 0 : o.hashCode();
  }

  /** True when both arguments are null, or {@code a} is non-null and equals {@code b}. */
  private static boolean sameOrBothNull(Object a, Object b) {
    return a == null ? b == null : a.equals(b);
  }
}

View File

@ -1,100 +0,0 @@
package org.apache.lucene.sandbox.queries;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.Collator;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.util.BytesRef;
/**
 * Subclass of FilteredTermsEnum for enumerating all terms that match the
 * specified range parameters, using a {@link Collator} to decide membership.
 * <p>Term enumerations are always ordered by
 * {@link BytesRef#compareTo}. Each term in the enumeration is
 * greater than all that precede it.</p>
 * <p>Enumeration is seeded at the empty term and {@link #accept(BytesRef)}
 * only ever answers YES or NO, so each term offered by the wrapped enum is
 * decoded and collated individually.</p>
 * @deprecated Index collation keys with CollationKeyAnalyzer or ICUCollationKeyAnalyzer instead.
 * This class will be removed in Lucene 5.0
 */
@Deprecated
public class SlowCollatedTermRangeTermsEnum extends FilteredTermsEnum {
  private final Collator collator;
  /** Upper bound, or null when the upper end is open. */
  private final String upperTermText;
  /** Lower bound; never null after construction (an open lower end is stored as ""). */
  private final String lowerTermText;
  private final boolean includeLower;
  private final boolean includeUpper;

  /**
   * Enumerates all terms greater/equal than <code>lowerTerm</code>
   * but less/equal than <code>upperTerm</code>.
   *
   * If an endpoint is null, it is said to be "open". Either or both
   * endpoints may be open. An open lower endpoint is normalized to the
   * inclusive empty string; an open upper endpoint disables the upper check,
   * so <code>includeUpper</code> is ignored in that case.
   *
   * @param tenum source of the terms to enumerate.
   * @param lowerTermText
   *          The term text at the lower end of the range
   * @param upperTermText
   *          The term text at the upper end of the range
   * @param includeLower
   *          If true, the <code>lowerTerm</code> is included in the range.
   * @param includeUpper
   *          If true, the <code>upperTerm</code> is included in the range.
   * @param collator
   *          The collator to use to collate index Terms, to determine their
   *          membership in the range bounded by <code>lowerTerm</code> and
   *          <code>upperTerm</code>. Used without a null check.
   */
  public SlowCollatedTermRangeTermsEnum(TermsEnum tenum, String lowerTermText, String upperTermText,
      boolean includeLower, boolean includeUpper, Collator collator) {
    super(tenum);
    this.collator = collator;
    this.upperTermText = upperTermText;
    this.includeUpper = includeUpper;

    // do a little bit of normalization...
    // open ended range queries should always be inclusive.
    if (lowerTermText == null) {
      this.lowerTermText = "";
      this.includeLower = true;
    } else {
      this.lowerTermText = lowerTermText;
      this.includeLower = includeLower;
    }

    // TODO: optimize
    setInitialSeekTerm(new BytesRef(""));
  }

  /**
   * Accepts a term when it collates inside the configured range.
   *
   * @param term the candidate term, decoded as UTF-8 before comparison
   * @return {@code AcceptStatus.YES} when the term is within both bounds,
   *         {@code AcceptStatus.NO} otherwise
   */
  @Override
  protected AcceptStatus accept(BytesRef term) {
    // Decode once; both bound checks compare the same string.
    final String text = term.utf8ToString();

    final int lowerCmp = collator.compare(text, lowerTermText);
    final boolean aboveLower = includeLower ? lowerCmp >= 0 : lowerCmp > 0;
    if (!aboveLower) {
      return AcceptStatus.NO;
    }

    if (upperTermText != null) {
      final int upperCmp = collator.compare(text, upperTermText);
      final boolean belowUpper = includeUpper ? upperCmp <= 0 : upperCmp < 0;
      if (!belowUpper) {
        return AcceptStatus.NO;
      }
    }
    return AcceptStatus.YES;
  }
}

View File

@ -1,190 +0,0 @@
package org.apache.lucene.sandbox.queries;
import java.text.Collator;
import java.util.Locale;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Tests SlowCollatedStringComparator, SlowCollatedTermRangeQuery, and SlowCollatedTermRangeFilter
 */
public class TestSlowCollationMethods extends LuceneTestCase {
  private static Collator collator;
  private static IndexSearcher searcher;
  private static IndexReader reader;
  private static Directory dir;
  private static int numDocs;
  private static String splitDoc;

  /**
   * Builds the shared index: {@code numDocs} documents, each carrying one
   * random unicode string both as a stored string field and as sorted doc
   * values under the name "field", plus a random split point and a collator
   * for a random locale.
   */
  @BeforeClass
  public static void beforeClass() throws Exception {
    final Locale locale = LuceneTestCase.randomLocale(random());
    collator = Collator.getInstance(locale);
    collator.setStrength(Collator.IDENTICAL);
    collator.setDecomposition(Collator.NO_DECOMPOSITION);

    numDocs = 1000 * RANDOM_MULTIPLIER;
    dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      String value = TestUtil.randomUnicodeString(random());
      Field field = newStringField("field", value, Field.Store.YES);
      doc.add(field);
      Field dvField = new SortedDocValuesField("field", new BytesRef(value));
      doc.add(dvField);
      iw.addDocument(doc);
    }
    splitDoc = TestUtil.randomUnicodeString(random());
    reader = iw.getReader();
    iw.close();

    searcher = newSearcher(reader);
  }

  /** Closes the shared index and clears every static reference. */
  @AfterClass
  public static void afterClass() throws Exception {
    reader.close();
    dir.close();
    collator = null;
    searcher = null;
    reader = null;
    dir = null;
    splitDoc = null;
  }

  /** Asserts that the stored values of the hits are in non-decreasing collation order. */
  private void doCheckSorting(TopDocs docs) throws Exception {
    String prev = "";
    for (ScoreDoc doc : docs.scoreDocs) {
      String value = reader.document(doc.doc).get("field");
      assertTrue(collator.compare(value, prev) >= 0);
      prev = value;
    }
  }

  /**
   * Sorts two complementary range searches with the slow comparator, then
   * checks each result list and their merge for collation order.
   */
  public void testSort() throws Exception {
    SortField sf = new SortField("field", new FieldComparatorSource() {
      @Override
      public FieldComparator<String> newComparator(String fieldname, int numHits, int sortPos, boolean reversed) {
        return new SlowCollatedStringComparator(numHits, fieldname, collator);
      }
    });
    final Sort sort = new Sort(sf);

    final TopFieldDocs docs1 = searcher.search(TermRangeQuery.newStringRange("field", null, splitDoc, true, true), null, numDocs/(1+random().nextInt(4)), sort);
    doCheckSorting(docs1);

    final TopFieldDocs docs2 = searcher.search(TermRangeQuery.newStringRange("field", splitDoc, null, true, true), null, numDocs/(1+random().nextInt(4)), sort);
    doCheckSorting(docs2);

    final TopFieldDocs docs = TopDocs.merge(sort, numDocs/(1+random().nextInt(4)), new TopFieldDocs[]{docs1, docs2});
    doCheckSorting(docs);
  }

  /**
   * Verifies a range query in both directions: every hit must collate inside
   * [startPoint, endPoint], and every document excluded by the query must
   * collate outside it.
   */
  private void doTestRanges(String startPoint, String endPoint, Query query) throws Exception {
    QueryUtils.check(query);

    // positive test
    TopDocs docs = searcher.search(query, numDocs);
    for (ScoreDoc doc : docs.scoreDocs) {
      String value = reader.document(doc.doc).get("field");
      assertTrue(collator.compare(value, startPoint) >= 0);
      assertTrue(collator.compare(value, endPoint) <= 0);
    }

    // negative test
    BooleanQuery bq = new BooleanQuery();
    bq.add(new MatchAllDocsQuery(), Occur.SHOULD);
    bq.add(query, Occur.MUST_NOT);
    docs = searcher.search(bq, numDocs);
    for (ScoreDoc doc : docs.scoreDocs) {
      String value = reader.document(doc.doc).get("field");
      assertTrue(collator.compare(value, startPoint) < 0 || collator.compare(value, endPoint) > 0);
    }
  }

  /** Runs random inclusive ranges through SlowCollatedTermRangeQuery. */
  public void testRangeQuery() throws Exception {
    int numQueries = 50*RANDOM_MULTIPLIER;
    for (int i = 0; i < numQueries; i++) {
      String startPoint = TestUtil.randomUnicodeString(random());
      String endPoint = TestUtil.randomUnicodeString(random());
      Query query = new SlowCollatedTermRangeQuery("field", startPoint, endPoint, true, true, collator);
      doTestRanges(startPoint, endPoint, query);
    }
  }

  /** Runs random inclusive ranges through SlowCollatedTermRangeFilter wrapped in a ConstantScoreQuery. */
  public void testRangeFilter() throws Exception {
    int numQueries = 50*RANDOM_MULTIPLIER;
    for (int i = 0; i < numQueries; i++) {
      String startPoint = TestUtil.randomUnicodeString(random());
      String endPoint = TestUtil.randomUnicodeString(random());
      Query query = new ConstantScoreQuery(new SlowCollatedTermRangeFilter("field", startPoint, endPoint, true, true, collator));
      doTestRanges(startPoint, endPoint, query);
    }
  }

  /** Runs the generic QueryUtils checks against a small private index. */
  public void testQuery() throws Exception {

    // Copied from beforeClass, but scaled down to few docs:
    // since otherwise this test can run for a very long
    // time (1-2 hours or more; see Lucene-Solr-4.x-Linux Build #2204):
    final Locale locale = LuceneTestCase.randomLocale(random());
    final Collator collator = Collator.getInstance(locale);
    collator.setStrength(Collator.IDENTICAL);
    collator.setDecomposition(Collator.NO_DECOMPOSITION);

    final int numDocs = 20 * RANDOM_MULTIPLIER;
    final Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      String value = TestUtil.randomUnicodeString(random());
      Field field = newStringField("field", value, Field.Store.YES);
      doc.add(field);
      iw.addDocument(doc);
    }
    final IndexReader reader = iw.getReader();
    iw.close();

    // From here on the writer is closed, so the reader and directory can be
    // released safely even when the checks below fail.
    try {
      IndexSearcher searcher = newSearcher(reader);

      String startPoint = TestUtil.randomUnicodeString(random());
      String endPoint = TestUtil.randomUnicodeString(random());
      Query query = new SlowCollatedTermRangeQuery("field", startPoint, endPoint, true, true, collator);
      QueryUtils.check(random(), query, searcher);
    } finally {
      try {
        reader.close();
      } finally {
        dir.close();
      }
    }
  }
}