mirror of https://github.com/apache/lucene.git
LUCENE-5666: add docvalues for JDK collator, too
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5666@1593806 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f0f6578cf1
commit
c52cad95fa
|
@ -0,0 +1,70 @@
|
|||
package org.apache.lucene.collation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.text.Collator;
|
||||
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.SortedDocValuesField;
|
||||
import org.apache.lucene.search.FieldCacheRangeFilter;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* Indexes collation keys as a single-valued {@link SortedDocValuesField}.
|
||||
* <p>
|
||||
* This is more efficient that {@link CollationKeyAnalyzer} if the field
|
||||
* only has one value: no uninversion is necessary to sort on the field,
|
||||
* locale-sensitive range queries can still work via {@link FieldCacheRangeFilter},
|
||||
* and the underlying data structures built at index-time are likely more efficient
|
||||
* and use less memory than FieldCache.
|
||||
*/
|
||||
public final class CollationDocValuesField extends Field {
|
||||
private final String name;
|
||||
private final Collator collator;
|
||||
private final BytesRef bytes = new BytesRef();
|
||||
|
||||
/**
|
||||
* Create a new ICUCollationDocValuesField.
|
||||
* <p>
|
||||
* NOTE: you should not create a new one for each document, instead
|
||||
* just make one and reuse it during your indexing process, setting
|
||||
* the value via {@link #setStringValue(String)}.
|
||||
* @param name field name
|
||||
* @param collator Collator for generating collation keys.
|
||||
*/
|
||||
// TODO: can we make this trap-free? maybe just synchronize on the collator
|
||||
// instead?
|
||||
public CollationDocValuesField(String name, Collator collator) {
|
||||
super(name, SortedDocValuesField.TYPE);
|
||||
this.name = name;
|
||||
this.collator = (Collator) collator.clone();
|
||||
fieldsData = bytes; // so wrong setters cannot be called
|
||||
}
|
||||
|
||||
@Override
|
||||
public String name() {
|
||||
return name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setStringValue(String value) {
|
||||
bytes.bytes = collator.getCollationKey(value).toByteArray();
|
||||
bytes.offset = 0;
|
||||
bytes.length = bytes.bytes.length;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,143 @@
|
|||
package org.apache.lucene.collation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.text.Collator;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.ConstantScoreQuery;
|
||||
import org.apache.lucene.search.FieldCacheRangeFilter;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.QueryUtils;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||
|
||||
/**
|
||||
* trivial test of CollationDocValuesField
|
||||
*/
|
||||
@SuppressCodecs("Lucene3x")
|
||||
public class TestCollationDocValuesField extends LuceneTestCase {
|
||||
|
||||
public void testBasic() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
Field field = newField("field", "", StringField.TYPE_STORED);
|
||||
CollationDocValuesField collationField = new CollationDocValuesField("collated", Collator.getInstance(Locale.ENGLISH));
|
||||
doc.add(field);
|
||||
doc.add(collationField);
|
||||
|
||||
field.setStringValue("ABC");
|
||||
collationField.setStringValue("ABC");
|
||||
iw.addDocument(doc);
|
||||
|
||||
field.setStringValue("abc");
|
||||
collationField.setStringValue("abc");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.shutdown();
|
||||
|
||||
IndexSearcher is = newSearcher(ir);
|
||||
|
||||
SortField sortField = new SortField("collated", SortField.Type.STRING);
|
||||
|
||||
TopDocs td = is.search(new MatchAllDocsQuery(), 5, new Sort(sortField));
|
||||
assertEquals("abc", ir.document(td.scoreDocs[0].doc).get("field"));
|
||||
assertEquals("ABC", ir.document(td.scoreDocs[1].doc).get("field"));
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testRanges() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
Field field = newField("field", "", StringField.TYPE_STORED);
|
||||
Collator collator = Collator.getInstance(); // uses -Dtests.locale
|
||||
if (random().nextBoolean()) {
|
||||
collator.setStrength(Collator.PRIMARY);
|
||||
}
|
||||
CollationDocValuesField collationField = new CollationDocValuesField("collated", collator);
|
||||
doc.add(field);
|
||||
doc.add(collationField);
|
||||
|
||||
int numDocs = atLeast(500);
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
String value = TestUtil.randomSimpleString(random());
|
||||
field.setStringValue(value);
|
||||
collationField.setStringValue(value);
|
||||
iw.addDocument(doc);
|
||||
}
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.shutdown();
|
||||
IndexSearcher is = newSearcher(ir);
|
||||
|
||||
int numChecks = atLeast(100);
|
||||
for (int i = 0; i < numChecks; i++) {
|
||||
String start = TestUtil.randomSimpleString(random());
|
||||
String end = TestUtil.randomSimpleString(random());
|
||||
BytesRef lowerVal = new BytesRef(collator.getCollationKey(start).toByteArray());
|
||||
BytesRef upperVal = new BytesRef(collator.getCollationKey(end).toByteArray());
|
||||
Query query = new ConstantScoreQuery(FieldCacheRangeFilter.newBytesRefRange("collated", lowerVal, upperVal, true, true));
|
||||
doTestRanges(is, start, end, query, collator);
|
||||
}
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private void doTestRanges(IndexSearcher is, String startPoint, String endPoint, Query query, Collator collator) throws Exception {
|
||||
QueryUtils.check(query);
|
||||
|
||||
// positive test
|
||||
TopDocs docs = is.search(query, is.getIndexReader().maxDoc());
|
||||
for (ScoreDoc doc : docs.scoreDocs) {
|
||||
String value = is.doc(doc.doc).get("field");
|
||||
assertTrue(collator.compare(value, startPoint) >= 0);
|
||||
assertTrue(collator.compare(value, endPoint) <= 0);
|
||||
}
|
||||
|
||||
// negative test
|
||||
BooleanQuery bq = new BooleanQuery();
|
||||
bq.add(new MatchAllDocsQuery(), Occur.SHOULD);
|
||||
bq.add(query, Occur.MUST_NOT);
|
||||
docs = is.search(bq, is.getIndexReader().maxDoc());
|
||||
for (ScoreDoc doc : docs.scoreDocs) {
|
||||
String value = is.doc(doc.doc).get("field");
|
||||
assertTrue(collator.compare(value, startPoint) < 0 || collator.compare(value, endPoint) > 0);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -60,23 +60,6 @@ public class TestCollationKeyAnalyzer extends CollationTestBase {
|
|||
secondRangeBeginning, secondRangeEnd);
|
||||
}
|
||||
|
||||
public void testCollationKeySort() throws Exception {
|
||||
Analyzer usAnalyzer
|
||||
= new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
|
||||
Analyzer franceAnalyzer
|
||||
= new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
|
||||
Analyzer swedenAnalyzer
|
||||
= new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
|
||||
Analyzer denmarkAnalyzer
|
||||
= new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
|
||||
|
||||
// The ICU Collator and Sun java.text.Collator implementations differ in their
|
||||
// orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US.
|
||||
testCollationKeySort
|
||||
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
|
||||
oStrokeFirst ? "BFJHD" : "BFJDH", "EACGI", "BJDFH", "BJDHF");
|
||||
}
|
||||
|
||||
public void testThreadSafe() throws Exception {
|
||||
int iters = 20 * RANDOM_MULTIPLIER;
|
||||
for (int i = 0; i < iters; i++) {
|
||||
|
|
|
@ -56,29 +56,6 @@ public class TestICUCollationKeyAnalyzer extends CollationTestBase {
|
|||
secondRangeBeginning, secondRangeEnd);
|
||||
}
|
||||
|
||||
// Test using various international locales with accented characters (which
|
||||
// sort differently depending on locale)
|
||||
//
|
||||
// Copied (and slightly modified) from
|
||||
// org.apache.lucene.search.TestSort.testInternationalSort()
|
||||
//
|
||||
public void testCollationKeySort() throws Exception {
|
||||
Analyzer usAnalyzer = new ICUCollationKeyAnalyzer
|
||||
(TEST_VERSION_CURRENT, Collator.getInstance(Locale.ROOT));
|
||||
Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer
|
||||
(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
|
||||
Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer
|
||||
(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
|
||||
Analyzer denmarkAnalyzer = new ICUCollationKeyAnalyzer
|
||||
(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
|
||||
|
||||
// The ICU Collator and java.text.Collator implementations differ in their
|
||||
// orderings - "BFJHD" is the ordering for the ICU Collator for Locale.ROOT.
|
||||
testCollationKeySort
|
||||
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
|
||||
"BFJHD", "ECAGI", "BJDFH", "BJDHF");
|
||||
}
|
||||
|
||||
public void testThreadSafe() throws Exception {
|
||||
int iters = 20 * RANDOM_MULTIPLIER;
|
||||
for (int i = 0; i < iters; i++) {
|
||||
|
|
|
@ -154,83 +154,6 @@ public abstract class CollationTestBase extends LuceneTestCase {
|
|||
farsiIndex.close();
|
||||
}
|
||||
|
||||
// Test using various international locales with accented characters (which
|
||||
// sort differently depending on locale)
|
||||
//
|
||||
// Copied (and slightly modified) from
|
||||
// org.apache.lucene.search.TestSort.testInternationalSort()
|
||||
//
|
||||
// TODO: this test is really fragile. there are already 3 different cases,
|
||||
// depending upon unicode version.
|
||||
public void testCollationKeySort(Analyzer usAnalyzer,
|
||||
Analyzer franceAnalyzer,
|
||||
Analyzer swedenAnalyzer,
|
||||
Analyzer denmarkAnalyzer,
|
||||
String usResult,
|
||||
String frResult,
|
||||
String svResult,
|
||||
String dkResult) throws Exception {
|
||||
Directory indexStore = newDirectory();
|
||||
IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(
|
||||
TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
|
||||
|
||||
// document data:
|
||||
// the tracer field is used to determine which document was hit
|
||||
String[][] sortData = new String[][] {
|
||||
// tracer contents US France Sweden (sv_SE) Denmark (da_DK)
|
||||
{ "A", "x", "p\u00EAche", "p\u00EAche", "p\u00EAche", "p\u00EAche" },
|
||||
{ "B", "y", "HAT", "HAT", "HAT", "HAT" },
|
||||
{ "C", "x", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9" },
|
||||
{ "D", "y", "HUT", "HUT", "HUT", "HUT" },
|
||||
{ "E", "x", "peach", "peach", "peach", "peach" },
|
||||
{ "F", "y", "H\u00C5T", "H\u00C5T", "H\u00C5T", "H\u00C5T" },
|
||||
{ "G", "x", "sin", "sin", "sin", "sin" },
|
||||
{ "H", "y", "H\u00D8T", "H\u00D8T", "H\u00D8T", "H\u00D8T" },
|
||||
{ "I", "x", "s\u00EDn", "s\u00EDn", "s\u00EDn", "s\u00EDn" },
|
||||
{ "J", "y", "HOT", "HOT", "HOT", "HOT" },
|
||||
};
|
||||
|
||||
FieldType customType = new FieldType();
|
||||
customType.setStored(true);
|
||||
|
||||
for (int i = 0 ; i < sortData.length ; ++i) {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("tracer", sortData[i][0], customType));
|
||||
doc.add(new TextField("contents", sortData[i][1], Field.Store.NO));
|
||||
if (sortData[i][2] != null)
|
||||
doc.add(new TextField("US", usAnalyzer.tokenStream("US", sortData[i][2])));
|
||||
if (sortData[i][3] != null)
|
||||
doc.add(new TextField("France", franceAnalyzer.tokenStream("France", sortData[i][3])));
|
||||
if (sortData[i][4] != null)
|
||||
doc.add(new TextField("Sweden", swedenAnalyzer.tokenStream("Sweden", sortData[i][4])));
|
||||
if (sortData[i][5] != null)
|
||||
doc.add(new TextField("Denmark", denmarkAnalyzer.tokenStream("Denmark", sortData[i][5])));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
writer.forceMerge(1);
|
||||
writer.shutdown();
|
||||
IndexReader reader = DirectoryReader.open(indexStore);
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
|
||||
Sort sort = new Sort();
|
||||
Query queryX = new TermQuery(new Term ("contents", "x"));
|
||||
Query queryY = new TermQuery(new Term ("contents", "y"));
|
||||
|
||||
sort.setSort(new SortField("US", SortField.Type.STRING));
|
||||
assertMatches(searcher, queryY, sort, usResult);
|
||||
|
||||
sort.setSort(new SortField("France", SortField.Type.STRING));
|
||||
assertMatches(searcher, queryX, sort, frResult);
|
||||
|
||||
sort.setSort(new SortField("Sweden", SortField.Type.STRING));
|
||||
assertMatches(searcher, queryY, sort, svResult);
|
||||
|
||||
sort.setSort(new SortField("Denmark", SortField.Type.STRING));
|
||||
assertMatches(searcher, queryY, sort, dkResult);
|
||||
reader.close();
|
||||
indexStore.close();
|
||||
}
|
||||
|
||||
// Make sure the documents returned by the search match the expected list
|
||||
// Copied from TestSort.java
|
||||
private void assertMatches(IndexSearcher searcher, Query query, Sort sort,
|
||||
|
|
Loading…
Reference in New Issue