LUCENE-5666: add docvalues for JDK collator, too

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5666@1593806 13f79535-47bb-0310-9956-ffa450edef68
2014-05-11 15:29:22 +00:00 · 2014-05-11 15:29:22 +00:00 · c52cad95fa
parent f0f6578cf1
commit c52cad95fa
5 changed files with 213 additions and 117 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationDocValuesField.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationDocValuesField.java
@ -0,0 +1,70 @@
+package org.apache.lucene.collation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.Collator;
+
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.search.FieldCacheRangeFilter;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Indexes collation keys as a single-valued {@link SortedDocValuesField}.
+ * <p>
+ * This is more efficient that {@link CollationKeyAnalyzer} if the field 
+ * only has one value: no uninversion is necessary to sort on the field, 
+ * locale-sensitive range queries can still work via {@link FieldCacheRangeFilter}, 
+ * and the underlying data structures built at index-time are likely more efficient 
+ * and use less memory than FieldCache.
+ */
+public final class CollationDocValuesField extends Field {
+  private final String name;
+  private final Collator collator;
+  private final BytesRef bytes = new BytesRef();
+  
+  /**
+   * Create a new ICUCollationDocValuesField.
+   * <p>
+   * NOTE: you should not create a new one for each document, instead
+   * just make one and reuse it during your indexing process, setting
+   * the value via {@link #setStringValue(String)}.
+   * @param name field name
+   * @param collator Collator for generating collation keys.
+   */
+  // TODO: can we make this trap-free? maybe just synchronize on the collator
+  // instead? 
+  public CollationDocValuesField(String name, Collator collator) {
+    super(name, SortedDocValuesField.TYPE);
+    this.name = name;
+    this.collator = (Collator) collator.clone();
+    fieldsData = bytes; // so wrong setters cannot be called
+  }
+
+  @Override
+  public String name() {
+    return name;
+  }
+  
+  @Override
+  public void setStringValue(String value) {
+    bytes.bytes = collator.getCollationKey(value).toByteArray();
+    bytes.offset = 0;
+    bytes.length = bytes.bytes.length;
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationDocValuesField.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationDocValuesField.java
@ -0,0 +1,143 @@
+package org.apache.lucene.collation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.Collator;
+import java.util.Locale;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.FieldCacheRangeFilter;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryUtils;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
+
+/**
+ * trivial test of CollationDocValuesField
+ */
+@SuppressCodecs("Lucene3x")
+public class TestCollationDocValuesField extends LuceneTestCase {
+  
+  public void testBasic() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    Field field = newField("field", "", StringField.TYPE_STORED);
+    CollationDocValuesField collationField = new CollationDocValuesField("collated", Collator.getInstance(Locale.ENGLISH));
+    doc.add(field);
+    doc.add(collationField);
+
+    field.setStringValue("ABC");
+    collationField.setStringValue("ABC");
+    iw.addDocument(doc);
+    
+    field.setStringValue("abc");
+    collationField.setStringValue("abc");
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.shutdown();
+    
+    IndexSearcher is = newSearcher(ir);
+    
+    SortField sortField = new SortField("collated", SortField.Type.STRING);
+    
+    TopDocs td = is.search(new MatchAllDocsQuery(), 5, new Sort(sortField));
+    assertEquals("abc", ir.document(td.scoreDocs[0].doc).get("field"));
+    assertEquals("ABC", ir.document(td.scoreDocs[1].doc).get("field"));
+    ir.close();
+    dir.close();
+  }
+  
+  public void testRanges() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    Field field = newField("field", "", StringField.TYPE_STORED);
+    Collator collator = Collator.getInstance(); // uses -Dtests.locale
+    if (random().nextBoolean()) {
+      collator.setStrength(Collator.PRIMARY);
+    }
+    CollationDocValuesField collationField = new CollationDocValuesField("collated", collator);
+    doc.add(field);
+    doc.add(collationField);
+    
+    int numDocs = atLeast(500);
+    for (int i = 0; i < numDocs; i++) {
+      String value = TestUtil.randomSimpleString(random());
+      field.setStringValue(value);
+      collationField.setStringValue(value);
+      iw.addDocument(doc);
+    }
+    
+    IndexReader ir = iw.getReader();
+    iw.shutdown();
+    IndexSearcher is = newSearcher(ir);
+    
+    int numChecks = atLeast(100);
+    for (int i = 0; i < numChecks; i++) {
+      String start = TestUtil.randomSimpleString(random());
+      String end = TestUtil.randomSimpleString(random());
+      BytesRef lowerVal = new BytesRef(collator.getCollationKey(start).toByteArray());
+      BytesRef upperVal = new BytesRef(collator.getCollationKey(end).toByteArray());
+      Query query = new ConstantScoreQuery(FieldCacheRangeFilter.newBytesRefRange("collated", lowerVal, upperVal, true, true));
+      doTestRanges(is, start, end, query, collator);
+    }
+    
+    ir.close();
+    dir.close();
+  }
+  
+  private void doTestRanges(IndexSearcher is, String startPoint, String endPoint, Query query, Collator collator) throws Exception { 
+    QueryUtils.check(query);
+    
+    // positive test
+    TopDocs docs = is.search(query, is.getIndexReader().maxDoc());
+    for (ScoreDoc doc : docs.scoreDocs) {
+      String value = is.doc(doc.doc).get("field");
+      assertTrue(collator.compare(value, startPoint) >= 0);
+      assertTrue(collator.compare(value, endPoint) <= 0);
+    }
+    
+    // negative test
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new MatchAllDocsQuery(), Occur.SHOULD);
+    bq.add(query, Occur.MUST_NOT);
+    docs = is.search(bq, is.getIndexReader().maxDoc());
+    for (ScoreDoc doc : docs.scoreDocs) {
+      String value = is.doc(doc.doc).get("field");
+      assertTrue(collator.compare(value, startPoint) < 0 || collator.compare(value, endPoint) > 0);
+    }
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
@ -60,23 +60,6 @@ public class TestCollationKeyAnalyzer extends CollationTestBase {
       secondRangeBeginning, secondRangeEnd);
  }
  
-  public void testCollationKeySort() throws Exception {
-    Analyzer usAnalyzer 
-      = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
-    Analyzer franceAnalyzer 
-      = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
-    Analyzer swedenAnalyzer 
-      = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
-    Analyzer denmarkAnalyzer 
-      = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
-    
-    // The ICU Collator and Sun java.text.Collator implementations differ in their
-    // orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US.
-    testCollationKeySort
-    (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, 
-     oStrokeFirst ? "BFJHD" : "BFJDH", "EACGI", "BJDFH", "BJDHF");
-  }
-  
  public void testThreadSafe() throws Exception {
    int iters = 20 * RANDOM_MULTIPLIER;
    for (int i = 0; i < iters; i++) {
--- a/lucene/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
@ -56,29 +56,6 @@ public class TestICUCollationKeyAnalyzer extends CollationTestBase {
       secondRangeBeginning, secondRangeEnd);
  }

-  // Test using various international locales with accented characters (which
-  // sort differently depending on locale)
-  //
-  // Copied (and slightly modified) from 
-  // org.apache.lucene.search.TestSort.testInternationalSort()
-  //  
-  public void testCollationKeySort() throws Exception {
-    Analyzer usAnalyzer = new ICUCollationKeyAnalyzer
-      (TEST_VERSION_CURRENT, Collator.getInstance(Locale.ROOT));
-    Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer
-      (TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
-    Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer
-      (TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
-    Analyzer denmarkAnalyzer = new ICUCollationKeyAnalyzer
-      (TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
-
-    // The ICU Collator and java.text.Collator implementations differ in their
-    // orderings - "BFJHD" is the ordering for the ICU Collator for Locale.ROOT.
-    testCollationKeySort
-    (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, 
-     "BFJHD", "ECAGI", "BJDFH", "BJDHF");
-  }
-  
  public void testThreadSafe() throws Exception {
    int iters = 20 * RANDOM_MULTIPLIER;
    for (int i = 0; i < iters; i++) {
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/CollationTestBase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/CollationTestBase.java
@ -154,83 +154,6 @@ public abstract class CollationTestBase extends LuceneTestCase {
    farsiIndex.close();
  }
  
-  // Test using various international locales with accented characters (which
-  // sort differently depending on locale)
-  //
-  // Copied (and slightly modified) from 
-  // org.apache.lucene.search.TestSort.testInternationalSort()
-  //  
-  // TODO: this test is really fragile. there are already 3 different cases,
-  // depending upon unicode version.
-  public void testCollationKeySort(Analyzer usAnalyzer,
-                                   Analyzer franceAnalyzer,
-                                   Analyzer swedenAnalyzer,
-                                   Analyzer denmarkAnalyzer,
-                                   String usResult,
-                                   String frResult,
-                                   String svResult,
-                                   String dkResult) throws Exception {
-    Directory indexStore = newDirectory();
-    IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(
-        TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
-
-    // document data:
-    // the tracer field is used to determine which document was hit
-    String[][] sortData = new String[][] {
-      // tracer contents US                 France             Sweden (sv_SE)     Denmark (da_DK)
-      {  "A",   "x",     "p\u00EAche",      "p\u00EAche",      "p\u00EAche",      "p\u00EAche"      },
-      {  "B",   "y",     "HAT",             "HAT",             "HAT",             "HAT"             },
-      {  "C",   "x",     "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9" },
-      {  "D",   "y",     "HUT",             "HUT",             "HUT",             "HUT"             },
-      {  "E",   "x",     "peach",           "peach",           "peach",           "peach"           },
-      {  "F",   "y",     "H\u00C5T",        "H\u00C5T",        "H\u00C5T",        "H\u00C5T"        },
-      {  "G",   "x",     "sin",             "sin",             "sin",             "sin"             },
-      {  "H",   "y",     "H\u00D8T",        "H\u00D8T",        "H\u00D8T",        "H\u00D8T"        },
-      {  "I",   "x",     "s\u00EDn",        "s\u00EDn",        "s\u00EDn",        "s\u00EDn"        },
-      {  "J",   "y",     "HOT",             "HOT",             "HOT",             "HOT"             },
-    };
-
-    FieldType customType = new FieldType();
-    customType.setStored(true);
-    
-    for (int i = 0 ; i < sortData.length ; ++i) {
-      Document doc = new Document();
-      doc.add(new Field("tracer", sortData[i][0], customType));
-      doc.add(new TextField("contents", sortData[i][1], Field.Store.NO));
-      if (sortData[i][2] != null) 
-        doc.add(new TextField("US", usAnalyzer.tokenStream("US", sortData[i][2])));
-      if (sortData[i][3] != null) 
-        doc.add(new TextField("France", franceAnalyzer.tokenStream("France", sortData[i][3])));
-      if (sortData[i][4] != null)
-        doc.add(new TextField("Sweden", swedenAnalyzer.tokenStream("Sweden", sortData[i][4])));
-      if (sortData[i][5] != null) 
-        doc.add(new TextField("Denmark", denmarkAnalyzer.tokenStream("Denmark", sortData[i][5])));
-      writer.addDocument(doc);
-    }
-    writer.forceMerge(1);
-    writer.shutdown();
-    IndexReader reader = DirectoryReader.open(indexStore);
-    IndexSearcher searcher = new IndexSearcher(reader);
-
-    Sort sort = new Sort();
-    Query queryX = new TermQuery(new Term ("contents", "x"));
-    Query queryY = new TermQuery(new Term ("contents", "y"));
-    
-    sort.setSort(new SortField("US", SortField.Type.STRING));
-    assertMatches(searcher, queryY, sort, usResult);
-
-    sort.setSort(new SortField("France", SortField.Type.STRING));
-    assertMatches(searcher, queryX, sort, frResult);
-
-    sort.setSort(new SortField("Sweden", SortField.Type.STRING));
-    assertMatches(searcher, queryY, sort, svResult);
-
-    sort.setSort(new SortField("Denmark", SortField.Type.STRING));
-    assertMatches(searcher, queryY, sort, dkResult);
-    reader.close();
-    indexStore.close();
-  }
-    
  // Make sure the documents returned by the search match the expected list
  // Copied from TestSort.java
  private void assertMatches(IndexSearcher searcher, Query query, Sort sort,