LUCENE-5666: add docvalues for JDK collator, too

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5666@1593806 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-05-11 15:29:22 +00:00
parent f0f6578cf1
commit c52cad95fa
5 changed files with 213 additions and 117 deletions

View File

@ -0,0 +1,70 @@
package org.apache.lucene.collation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.Collator;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.search.FieldCacheRangeFilter;
import org.apache.lucene.util.BytesRef;
/**
 * Indexes collation keys as a single-valued {@link SortedDocValuesField}.
 * <p>
 * This is more efficient than {@link CollationKeyAnalyzer} if the field
 * only has one value: no uninversion is necessary to sort on the field,
 * locale-sensitive range queries can still work via {@link FieldCacheRangeFilter},
 * and the underlying data structures built at index-time are likely more efficient
 * and use less memory than FieldCache.
 */
public final class CollationDocValuesField extends Field {
  /** Field name, returned by {@link #name()}. */
  private final String name;
  /** Private clone of the collator supplied to the constructor. */
  private final Collator collator;
  /** Holder for the current collation key; installed as this field's data. */
  private final BytesRef bytes = new BytesRef();

  /**
   * Create a new CollationDocValuesField.
   * <p>
   * NOTE: you should not create a new one for each document, instead
   * just make one and reuse it during your indexing process, setting
   * the value via {@link #setStringValue(String)}.
   * <p>
   * The supplied collator is cloned, and the clone is what this field uses
   * to generate keys.
   *
   * @param name field name
   * @param collator Collator for generating collation keys.
   */
  // TODO: can we make this trap-free? maybe just synchronize on the collator
  // instead?
  public CollationDocValuesField(String name, Collator collator) {
    super(name, SortedDocValuesField.TYPE);
    this.name = name;
    final Collator privateCopy = (Collator) collator.clone();
    this.collator = privateCopy;
    // so wrong setters cannot be called
    fieldsData = bytes;
  }

  /** Returns the field name given at construction time. */
  @Override
  public String name() {
    return name;
  }

  /**
   * Replaces the current value with the collation key of {@code value}.
   * The key's byte array is installed directly into the shared
   * {@link BytesRef}, starting at offset 0 and spanning the whole array.
   *
   * @param value the string whose collation key should be stored
   */
  @Override
  public void setStringValue(String value) {
    final byte[] key = collator.getCollationKey(value).toByteArray();
    bytes.bytes = key;
    bytes.offset = 0;
    bytes.length = key.length;
  }
}

View File

@ -0,0 +1,143 @@
package org.apache.lucene.collation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.Collator;
import java.util.Locale;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FieldCacheRangeFilter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryUtils;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
/**
 * Trivial test of {@link CollationDocValuesField}: sorting on the doc-values
 * field and range filtering over collation keys.
 */
@SuppressCodecs("Lucene3x")
public class TestCollationDocValuesField extends LuceneTestCase {

  /**
   * Indexes "ABC" and "abc" with an English collator and expects a STRING
   * sort on the collated field to return "abc" ahead of "ABC".
   */
  public void testBasic() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);

    Field storedField = newField("field", "", StringField.TYPE_STORED);
    CollationDocValuesField collatedField =
        new CollationDocValuesField("collated", Collator.getInstance(Locale.ENGLISH));

    // One Document instance is reused; only the field values change per add.
    Document document = new Document();
    document.add(storedField);
    document.add(collatedField);

    for (String text : new String[] { "ABC", "abc" }) {
      storedField.setStringValue(text);
      collatedField.setStringValue(text);
      writer.addDocument(document);
    }

    IndexReader reader = writer.getReader();
    writer.shutdown();

    IndexSearcher searcher = newSearcher(reader);
    Sort byCollationKey = new Sort(new SortField("collated", SortField.Type.STRING));
    TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 5, byCollationKey);
    ScoreDoc[] hits = topDocs.scoreDocs;

    assertEquals("abc", reader.document(hits[0].doc).get("field"));
    assertEquals("ABC", reader.document(hits[1].doc).get("field"));

    reader.close();
    directory.close();
  }

  /**
   * Indexes random simple strings, then checks random inclusive ranges built
   * from collation keys against {@link Collator#compare(String, String)}.
   */
  public void testRanges() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);

    Field storedField = newField("field", "", StringField.TYPE_STORED);

    Collator collator = Collator.getInstance(); // uses -Dtests.locale
    if (random().nextBoolean()) {
      collator.setStrength(Collator.PRIMARY);
    }
    CollationDocValuesField collatedField = new CollationDocValuesField("collated", collator);

    Document document = new Document();
    document.add(storedField);
    document.add(collatedField);

    final int docCount = atLeast(500);
    for (int added = 0; added < docCount; added++) {
      String text = TestUtil.randomSimpleString(random());
      storedField.setStringValue(text);
      collatedField.setStringValue(text);
      writer.addDocument(document);
    }

    IndexReader reader = writer.getReader();
    writer.shutdown();
    IndexSearcher searcher = newSearcher(reader);

    final int checkCount = atLeast(100);
    for (int check = 0; check < checkCount; check++) {
      String start = TestUtil.randomSimpleString(random());
      String end = TestUtil.randomSimpleString(random());
      BytesRef lowerKey = collationKey(collator, start);
      BytesRef upperKey = collationKey(collator, end);
      Query rangeQuery = new ConstantScoreQuery(
          FieldCacheRangeFilter.newBytesRefRange("collated", lowerKey, upperKey, true, true));
      doTestRanges(searcher, start, end, rangeQuery, collator);
    }

    reader.close();
    directory.close();
  }

  /** Wraps the collation key bytes of {@code text} in a {@link BytesRef}. */
  private static BytesRef collationKey(Collator collator, String text) {
    return new BytesRef(collator.getCollationKey(text).toByteArray());
  }

  /**
   * Verifies that every hit of {@code query} lies inside
   * [{@code startPoint}, {@code endPoint}] according to {@code collator},
   * and that every document NOT matched by {@code query} lies outside it.
   */
  private void doTestRanges(IndexSearcher is, String startPoint, String endPoint, Query query, Collator collator) throws Exception {
    QueryUtils.check(query);

    // positive test
    TopDocs matched = is.search(query, is.getIndexReader().maxDoc());
    ScoreDoc[] matchedHits = matched.scoreDocs;
    for (int i = 0; i < matchedHits.length; i++) {
      String value = is.doc(matchedHits[i].doc).get("field");
      assertTrue(collator.compare(value, startPoint) >= 0);
      assertTrue(collator.compare(value, endPoint) <= 0);
    }

    // negative test: everything except the range query's hits
    BooleanQuery complement = new BooleanQuery();
    complement.add(new MatchAllDocsQuery(), Occur.SHOULD);
    complement.add(query, Occur.MUST_NOT);
    TopDocs unmatched = is.search(complement, is.getIndexReader().maxDoc());
    ScoreDoc[] unmatchedHits = unmatched.scoreDocs;
    for (int i = 0; i < unmatchedHits.length; i++) {
      String value = is.doc(unmatchedHits[i].doc).get("field");
      // must not be inside the inclusive range
      assertFalse(collator.compare(value, startPoint) >= 0 && collator.compare(value, endPoint) <= 0);
    }
  }
}

View File

@ -60,23 +60,6 @@ public class TestCollationKeyAnalyzer extends CollationTestBase {
secondRangeBeginning, secondRangeEnd);
}
/**
 * Runs the shared collation-key sort check with {@link CollationKeyAnalyzer}s
 * built for the US, France, Sweden (sv_SE) and Denmark (da_DK) locales.
 */
public void testCollationKeySort() throws Exception {
  final Locale sweden = new Locale("sv", "se");
  final Locale denmark = new Locale("da", "dk");

  Analyzer usAnalyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
  Analyzer franceAnalyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
  Analyzer swedenAnalyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(sweden));
  Analyzer denmarkAnalyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(denmark));

  // The ICU Collator and Sun java.text.Collator implementations differ in their
  // orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US.
  final String expectedUsOrder;
  if (oStrokeFirst) {
    expectedUsOrder = "BFJHD";
  } else {
    expectedUsOrder = "BFJDH";
  }

  testCollationKeySort(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
      expectedUsOrder, "EACGI", "BJDFH", "BJDHF");
}
public void testThreadSafe() throws Exception {
int iters = 20 * RANDOM_MULTIPLIER;
for (int i = 0; i < iters; i++) {

View File

@ -56,29 +56,6 @@ public class TestICUCollationKeyAnalyzer extends CollationTestBase {
secondRangeBeginning, secondRangeEnd);
}
/**
 * Test using various international locales with accented characters (which
 * sort differently depending on locale).
 * <p>
 * Copied (and slightly modified) from
 * org.apache.lucene.search.TestSort.testInternationalSort()
 */
public void testCollationKeySort() throws Exception {
  final Locale sweden = new Locale("sv", "se");
  final Locale denmark = new Locale("da", "dk");

  // The first analyzer fills the "US" slot of the shared test but is built from Locale.ROOT.
  Analyzer rootAnalyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.ROOT));
  Analyzer frenchAnalyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
  Analyzer swedishAnalyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(sweden));
  Analyzer danishAnalyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(denmark));

  // The ICU Collator and java.text.Collator implementations differ in their
  // orderings - "BFJHD" is the ordering for the ICU Collator for Locale.ROOT.
  testCollationKeySort(rootAnalyzer, frenchAnalyzer, swedishAnalyzer, danishAnalyzer,
      "BFJHD", "ECAGI", "BJDFH", "BJDHF");
}
public void testThreadSafe() throws Exception {
int iters = 20 * RANDOM_MULTIPLIER;
for (int i = 0; i < iters; i++) {

View File

@ -154,83 +154,6 @@ public abstract class CollationTestBase extends LuceneTestCase {
farsiIndex.close();
}
// Test using various international locales with accented characters (which
// sort differently depending on locale)
//
// Copied (and slightly modified) from
// org.apache.lucene.search.TestSort.testInternationalSort()
//
// TODO: this test is really fragile. there are already 3 different cases,
// depending upon unicode version.
/**
 * Indexes ten documents whose text is analyzed once per locale-specific
 * analyzer, then sorts on each locale field and compares the resulting
 * sequence of "tracer" values with the expected string for that locale.
 * The France field is checked against the documents whose contents is "x";
 * the US, Sweden and Denmark fields against those whose contents is "y".
 *
 * @param usAnalyzer      analyzer for the "US" field
 * @param franceAnalyzer  analyzer for the "France" field
 * @param swedenAnalyzer  analyzer for the "Sweden" field
 * @param denmarkAnalyzer analyzer for the "Denmark" field
 * @param usResult        expected tracer order when sorting on "US"
 * @param frResult        expected tracer order when sorting on "France"
 * @param svResult        expected tracer order when sorting on "Sweden"
 * @param dkResult        expected tracer order when sorting on "Denmark"
 */
public void testCollationKeySort(Analyzer usAnalyzer,
                                 Analyzer franceAnalyzer,
                                 Analyzer swedenAnalyzer,
                                 Analyzer denmarkAnalyzer,
                                 String usResult,
                                 String frResult,
                                 String svResult,
                                 String dkResult) throws Exception {
  Directory indexStore = newDirectory();
  MockAnalyzer indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  IndexWriterConfig writerConfig = new IndexWriterConfig(TEST_VERSION_CURRENT, indexAnalyzer);
  IndexWriter writer = new IndexWriter(indexStore, writerConfig);

  // document data:
  // the tracer field is used to determine which document was hit
  String[][] sortData = new String[][] {
    // tracer contents US                 France             Sweden (sv_SE)     Denmark (da_DK)
    { "A", "x", "p\u00EAche", "p\u00EAche", "p\u00EAche", "p\u00EAche" },
    { "B", "y", "HAT", "HAT", "HAT", "HAT" },
    { "C", "x", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9" },
    { "D", "y", "HUT", "HUT", "HUT", "HUT" },
    { "E", "x", "peach", "peach", "peach", "peach" },
    { "F", "y", "H\u00C5T", "H\u00C5T", "H\u00C5T", "H\u00C5T" },
    { "G", "x", "sin", "sin", "sin", "sin" },
    { "H", "y", "H\u00D8T", "H\u00D8T", "H\u00D8T", "H\u00D8T" },
    { "I", "x", "s\u00EDn", "s\u00EDn", "s\u00EDn", "s\u00EDn" },
    { "J", "y", "HOT", "HOT", "HOT", "HOT" },
  };

  // Locale fields in data-column order; column (k + 2) of a row feeds localeFields[k].
  final String[] localeFields = { "US", "France", "Sweden", "Denmark" };
  final Analyzer[] localeAnalyzers = { usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer };

  FieldType tracerType = new FieldType();
  tracerType.setStored(true);

  for (String[] row : sortData) {
    Document doc = new Document();
    doc.add(new Field("tracer", row[0], tracerType));
    doc.add(new TextField("contents", row[1], Field.Store.NO));
    for (int k = 0; k < localeFields.length; k++) {
      String text = row[k + 2];
      if (text != null) {
        doc.add(new TextField(localeFields[k], localeAnalyzers[k].tokenStream(localeFields[k], text)));
      }
    }
    writer.addDocument(doc);
  }
  writer.forceMerge(1);
  writer.shutdown();

  IndexReader reader = DirectoryReader.open(indexStore);
  IndexSearcher searcher = new IndexSearcher(reader);

  Sort sort = new Sort();
  Query queryX = new TermQuery(new Term ("contents", "x"));
  Query queryY = new TermQuery(new Term ("contents", "y"));

  // Query and expected tracer order for each locale field, in localeFields order.
  final Query[] localeQueries = { queryY, queryX, queryY, queryY };
  final String[] expectedOrders = { usResult, frResult, svResult, dkResult };

  for (int k = 0; k < localeFields.length; k++) {
    sort.setSort(new SortField(localeFields[k], SortField.Type.STRING));
    assertMatches(searcher, localeQueries[k], sort, expectedOrders[k]);
  }

  reader.close();
  indexStore.close();
}
// Make sure the documents returned by the search match the expected list
// Copied from TestSort.java
private void assertMatches(IndexSearcher searcher, Query query, Sort sort,