diff --git a/CHANGES.txt b/CHANGES.txt index 0442a3e610b..da617db1325 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -83,6 +83,10 @@ Optimizations Build + * LUCENE-2124: Moved the JDK-based collation support from contrib/collation + into core, and moved the ICU-based collation support into contrib/icu. + (Robert Muir) + Test Cases * LUCENE-2037 Allow Junit4 tests in our envrionment (Erick Erickson diff --git a/NOTICE.txt b/NOTICE.txt index 8eeeb6121a0..593f25cd9c3 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -30,6 +30,6 @@ Includes lib/servlet-api-2.4.jar from Apache Tomcat The SmartChineseAnalyzer source code (under contrib/analyzers) was provided by Xiaoping Gao and copyright 2009 by www.imdict.net. -ICU4J, (under contrib/collation) is licensed under an MIT styles license -(contrib/collation/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008 +ICU4J, (under contrib/icu) is licensed under an MIT styles license +(contrib/icu/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008 International Business Machines Corporation and others diff --git a/build.xml b/build.xml index 15a4c766072..05d1246164d 100644 --- a/build.xml +++ b/build.xml @@ -300,7 +300,7 @@ - + @@ -334,7 +334,7 @@ - + diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 30719c114a0..f3b2997e401 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -37,6 +37,12 @@ New features * LUCENE-2062: Add a Bulgarian analyzer. (Robert Muir, Simon Willnauer) +Build + + * LUCENE-2124: Moved the JDK-based collation support from contrib/collation + into core, and moved the ICU-based collation support into contrib/icu. + (Robert Muir) + Test Cases * LUCENE-2115: Cutover contrib tests to use Java5 generics. 
(Kay Kay diff --git a/contrib/collation/build.xml b/contrib/icu/build.xml similarity index 65% rename from contrib/collation/build.xml rename to contrib/icu/build.xml index 5f03c2082d8..f8fbd2438f9 100644 --- a/contrib/collation/build.xml +++ b/contrib/icu/build.xml @@ -17,17 +17,16 @@ limitations under the License. --> - + - CollationKeyFilter, ICUCollationKeyFilter, CollationKeyAnalyzer, and - ICUCollationKeyAnalyzer - converts tokens into indexable collation keys + Provides integration with ICU (International Components for Unicode) for + stronger Unicode and internationalization support. - - - - - - - - - - - - - diff --git a/contrib/collation/lib/ICU-LICENSE.txt b/contrib/icu/lib/ICU-LICENSE.txt similarity index 100% rename from contrib/collation/lib/ICU-LICENSE.txt rename to contrib/icu/lib/ICU-LICENSE.txt diff --git a/contrib/collation/lib/icu4j-collation-4.0.jar b/contrib/icu/lib/icu4j-collation-4.0.jar similarity index 100% rename from contrib/collation/lib/icu4j-collation-4.0.jar rename to contrib/icu/lib/icu4j-collation-4.0.jar diff --git a/contrib/collation/pom.xml.template b/contrib/icu/pom.xml.template similarity index 85% rename from contrib/collation/pom.xml.template rename to contrib/icu/pom.xml.template index 5f1b6621864..bc3ad8df9be 100644 --- a/contrib/collation/pom.xml.template +++ b/contrib/icu/pom.xml.template @@ -27,14 +27,14 @@ @version@ org.apache.lucene - lucene-collation + lucene-icu - Lucene CollationKeyFilter/Analyzer & ICUCollationKeyFilter/Analyzer + Lucene ICUCollationKeyFilter/Analyzer @version@ - CollationKeyFilter, ICUCollationKeyFilter, CollationKeyAnalyzer, and - ICUCollationKeyAnalyzer - converts tokens into indexable collation keys + Provides integration with ICU (International Components for Unicode) for + stronger Unicode and internationalization support. 
jar diff --git a/contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java b/contrib/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java similarity index 100% rename from contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java rename to contrib/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java diff --git a/contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java b/contrib/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java similarity index 100% rename from contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java rename to contrib/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java diff --git a/contrib/collation/src/java/org/apache/lucene/collation/package.html b/contrib/icu/src/java/org/apache/lucene/collation/package.html similarity index 94% rename from contrib/collation/src/java/org/apache/lucene/collation/package.html rename to contrib/icu/src/java/org/apache/lucene/collation/package.html index 25d69e49165..4e2882cc7a4 100644 --- a/contrib/collation/src/java/org/apache/lucene/collation/package.html +++ b/contrib/icu/src/java/org/apache/lucene/collation/package.html @@ -21,8 +21,8 @@

- CollationKeyFilter and ICUCollationKeyFilter - convert each token into its binary CollationKey using the + ICUCollationKeyFilter + converts each token into its binary CollationKey using the provided Collator, and then encode the CollationKey as a String using {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be @@ -60,9 +60,8 @@

Farsi Range Queries

-  // "fa" Locale is not supported by Sun JDK 1.4 or 1.5
   Collator collator = Collator.getInstance(new Locale("ar"));
-  CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
+  ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
   RAMDirectory ramDir = new RAMDirectory();
   IndexWriter writer = new IndexWriter
     (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
@@ -92,7 +91,7 @@
 

Danish Sorting

   Analyzer analyzer 
-    = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+    = new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
   RAMDirectory indexStore = new RAMDirectory();
   IndexWriter writer = new IndexWriter 
     (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
@@ -121,7 +120,7 @@
 
   Collator collator = Collator.getInstance(new Locale("tr", "TR"));
   collator.setStrength(Collator.PRIMARY);
-  Analyzer analyzer = new CollationKeyAnalyzer(collator);
+  Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
   RAMDirectory ramDir = new RAMDirectory();
   IndexWriter writer = new IndexWriter
     (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
diff --git a/contrib/collation/src/java/overview.html b/contrib/icu/src/java/overview.html
similarity index 90%
rename from contrib/collation/src/java/overview.html
rename to contrib/icu/src/java/overview.html
index cd090a9226d..b9d26c15330 100644
--- a/contrib/collation/src/java/overview.html
+++ b/contrib/icu/src/java/overview.html
@@ -17,8 +17,7 @@
 
   
     
-      Apache Lucene CollationKeyFilter/Analyzer and 
-      ICUCollationKeyFilter/Analyzer
+      Apache Lucene ICUCollationKeyFilter/Analyzer
     
   
   
diff --git a/contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java b/contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
similarity index 96%
rename from contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
rename to contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
index 19a8359ebcc..b018ffc357c 100644
--- a/contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
+++ b/contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
@@ -37,11 +37,6 @@ public class TestICUCollationKeyAnalyzer extends CollationTestBase {
     (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
   private String secondRangeEnd = encodeCollationKey
     (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
-
-  
-  public void testFarsiQueryParserCollating() throws Exception {
-    testFarsiQueryParserCollating(analyzer);
-  }
   
   public void testFarsiRangeFilterCollating() throws Exception {
     testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd, 
diff --git a/contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java b/contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
similarity index 96%
rename from contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
rename to contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
index b7e7c8241b6..e0a6c998609 100644
--- a/contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
+++ b/contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
@@ -57,11 +57,6 @@ public class TestICUCollationKeyFilter extends CollationTestBase {
     }
   }
 
-  public void testFarsiQueryParserCollating() throws Exception {
-    testFarsiQueryParserCollating(analyzer);
-  }
-  
-  
   public void testFarsiRangeFilterCollating() throws Exception {
     testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd, 
                                   secondRangeBeginning, secondRangeEnd);
diff --git a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java b/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
similarity index 96%
rename from contrib/collation/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
rename to src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
index 397a06b21e3..081624a4b27 100644
--- a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
+++ b/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
@@ -59,7 +59,8 @@ import java.io.IOException;
  *   
  *  
  * 

- * {@link ICUCollationKeyAnalyzer} uses ICU4J's Collator, which makes + * The ICUCollationKeyAnalyzer in the icu package of Lucene's + * contrib area uses ICU4J's Collator, which makes * its version available, thus allowing collation to be versioned * independently from the JVM. ICUCollationKeyAnalyzer is also significantly * faster and generates significantly shorter keys than CollationKeyAnalyzer. diff --git a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java b/src/java/org/apache/lucene/collation/CollationKeyFilter.java similarity index 95% rename from contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java rename to src/java/org/apache/lucene/collation/CollationKeyFilter.java index 96d5ef16afd..6f0ea0578d2 100644 --- a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java +++ b/src/java/org/apache/lucene/collation/CollationKeyFilter.java @@ -20,7 +20,6 @@ package org.apache.lucene.collation; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.IndexableBinaryStringTools; @@ -58,7 +57,8 @@ import java.text.Collator; * * *

- * {@link ICUCollationKeyFilter} uses ICU4J's Collator, which makes its + * The ICUCollationKeyFilter in the icu package of Lucene's + * contrib area uses ICU4J's Collator, which makes its * version available, thus allowing collation to be versioned independently * from the JVM. ICUCollationKeyFilter is also significantly faster and * generates significantly shorter keys than CollationKeyFilter. See @@ -71,7 +71,7 @@ import java.text.Collator; * CollationKeys generated by java.text.Collators are not compatible * with those those generated by ICU Collators. Specifically, if you use * CollationKeyFilter to generate index terms, do not use - * {@link ICUCollationKeyFilter} on the query side, or vice versa. + * ICUCollationKeyFilter on the query side, or vice versa. *

*/ public final class CollationKeyFilter extends TokenFilter { diff --git a/src/java/org/apache/lucene/collation/package.html b/src/java/org/apache/lucene/collation/package.html new file mode 100644 index 00000000000..b0c6f8016a8 --- /dev/null +++ b/src/java/org/apache/lucene/collation/package.html @@ -0,0 +1,176 @@ + + + + + Lucene Collation Package + + +

+ CollationKeyFilter + converts each token into its binary CollationKey using the + provided Collator, and then encodes the CollationKey + as a String using + {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be + stored as an index term. +

+ +

Use Cases

+ +
    +
  • + Efficient sorting of terms in languages that use non-Unicode character + orderings. (Lucene Sort using a Locale can be very slow.) +
  • +
  • + Efficient range queries over fields that contain terms in languages that + use non-Unicode character orderings. (Range queries using a Locale can be + very slow.) +
  • +
  • + Effective Locale-specific normalization (case differences, diacritics, etc.). + ({@link org.apache.lucene.analysis.LowerCaseFilter} and + {@link org.apache.lucene.analysis.ASCIIFoldingFilter} provide these services + in a generic way that doesn't take into account locale-specific needs.) +
  • +
+ +

Example Usages

+ +

Farsi Range Queries

+
+  // "fa" Locale is not supported by Sun JDK 1.4 or 1.5
+  Collator collator = Collator.getInstance(new Locale("ar"));
+  CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
+  RAMDirectory ramDir = new RAMDirectory();
+  IndexWriter writer = new IndexWriter
+    (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+  Document doc = new Document();
+  doc.add(new Field("content", "\u0633\u0627\u0628", 
+                    Field.Store.YES, Field.Index.ANALYZED));
+  writer.addDocument(doc);
+  writer.close();
+  IndexSearcher is = new IndexSearcher(ramDir, true);
+
+  // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
+  // to be passed through an analyzer - Lucene's standard QueryParser does not
+  // allow this.
+  AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
+  aqp.setLowercaseExpandedTerms(false);
+  
+  // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
+  // orders the U+0698 character before the U+0633 character, so the single
+  // indexed Term above should NOT be returned by a TermRangeQuery
+  // with a Farsi Collator (or an Arabic one for the case when Farsi is not
+  // supported).
+  ScoreDoc[] result
+    = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
+  assertEquals("The index Term should not be included.", 0, result.length);
+
+ +

Danish Sorting

+
+  Analyzer analyzer 
+    = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+  RAMDirectory indexStore = new RAMDirectory();
+  IndexWriter writer = new IndexWriter 
+    (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+  String[] tracer = new String[] { "A", "B", "C", "D", "E" };
+  String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
+  String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
+  for (int i = 0 ; i < data.length ; ++i) {
+    Document doc = new Document();
+    doc.add(new Field("tracer", tracer[i], Field.Store.YES, Field.Index.NO));
+    doc.add(new Field("contents", data[i], Field.Store.NO, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+  }
+  writer.close();
+  Searcher searcher = new IndexSearcher(indexStore, true);
+  Sort sort = new Sort();
+  sort.setSort(new SortField("contents", SortField.STRING));
+  Query query = new MatchAllDocsQuery();
+  ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
+  for (int i = 0 ; i < result.length ; ++i) {
+    Document doc = searcher.doc(result[i].doc);
+    assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
+  }
+
+ +

Turkish Case Normalization

+
+  Collator collator = Collator.getInstance(new Locale("tr", "TR"));
+  collator.setStrength(Collator.PRIMARY);
+  Analyzer analyzer = new CollationKeyAnalyzer(collator);
+  RAMDirectory ramDir = new RAMDirectory();
+  IndexWriter writer = new IndexWriter
+    (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+  Document doc = new Document();
+  doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
+  writer.addDocument(doc);
+  writer.close();
+  IndexSearcher is = new IndexSearcher(ramDir, true);
+  QueryParser parser = new QueryParser("contents", analyzer);
+  Query query = parser.parse("d\u0131gy");   // U+0131: dotless i
+  ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
+  assertEquals("The index Term should be included.", 1, result.length);
+
+ +

Caveats and Comparisons

+

+ WARNING: Make sure you use exactly the same + Collator at index and query time -- CollationKeys + are only comparable when produced by + the same Collator. Since {@link java.text.RuleBasedCollator}s + are not independently versioned, it is unsafe to search against stored + CollationKeys unless the following are exactly the same (best + practice is to store this information with the index and check that they + remain the same at query time): +

+
    +
  1. JVM vendor
  2. +
  3. JVM version, including patch version
  4. +
  5. + The language (and country and variant, if specified) of the Locale + used when constructing the collator via + {@link java.text.Collator#getInstance(java.util.Locale)}. +
  6. +
  7. + The collation strength used - see {@link java.text.Collator#setStrength(int)} +
  8. +
+

+ ICUCollationKeyFilter, available in the icu package in Lucene's contrib area, + uses ICU4J's Collator, which + makes its version available, thus allowing collation to be versioned + independently from the JVM. ICUCollationKeyFilter is also + significantly faster and generates significantly shorter keys than + CollationKeyFilter. See + http://site.icu-project.org/charts/collation-icu4j-sun for key + generation timing and key length comparisons between ICU4J and + java.text.Collator over several languages. +

+

+ CollationKeys generated by java.text.Collators are + not compatible with those generated by ICU Collators. Specifically, if + you use CollationKeyFilter to generate index terms, do not use + ICUCollationKeyFilter on the query side, or vice versa. +

+
+
+ + diff --git a/contrib/collation/src/test/org/apache/lucene/collation/CollationTestBase.java b/src/test/org/apache/lucene/collation/CollationTestBase.java similarity index 89% rename from contrib/collation/src/test/org/apache/lucene/collation/CollationTestBase.java rename to src/test/org/apache/lucene/collation/CollationTestBase.java index 581f20a8b22..f5c8fe710cf 100644 --- a/contrib/collation/src/test/org/apache/lucene/collation/CollationTestBase.java +++ b/src/test/org/apache/lucene/collation/CollationTestBase.java @@ -38,7 +38,6 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.document.Field; import org.apache.lucene.document.Document; import org.apache.lucene.util.IndexableBinaryStringTools; -import org.apache.lucene.queryParser.analyzing.AnalyzingQueryParser; import org.apache.lucene.util.Version; import java.io.IOException; @@ -71,40 +70,7 @@ public class CollationTestBase extends TestCase { IndexableBinaryStringTools.encode(begBuf, encodedBegBuf); return new String(encodedBegArray); } - - public void testFarsiQueryParserCollating(Analyzer analyzer) throws Exception { - - RAMDirectory ramDir = new RAMDirectory(); - IndexWriter writer = new IndexWriter - (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); - Document doc = new Document(); - doc.add(new Field("content", "\u0633\u0627\u0628", - Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - writer.close(); - IndexSearcher is = new IndexSearcher(ramDir, true); - - AnalyzingQueryParser aqp = new AnalyzingQueryParser(Version.LUCENE_CURRENT, "content", analyzer); - aqp.setLowercaseExpandedTerms(false); - - // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi - // orders the U+0698 character before the U+0633 character, so the single - // index Term below should NOT be returned by a TermRangeQuery - // with a Farsi Collator (or an Arabic one for the case when Farsi is not - // supported). 
- - // Test TermRangeQuery - ScoreDoc[] result - = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs; - assertEquals("The index Term should not be included.", 0, result.length); - - result = is.search(aqp.parse("[ \u0633 TO \u0638 ]"), null, 1000).scoreDocs; - assertEquals("The index Term should be included.", 1, result.length); - - is.close(); - } - - + public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg, String firstEnd, String secondBeg, String secondEnd) throws Exception { diff --git a/contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java b/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java similarity index 96% rename from contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java rename to src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java index 4f6563c10b7..06c6d074deb 100644 --- a/contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java +++ b/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java @@ -41,11 +41,6 @@ public class TestCollationKeyAnalyzer extends CollationTestBase { private String secondRangeEnd = encodeCollationKey (collator.getCollationKey(secondRangeEndOriginal).toByteArray()); - - public void testFarsiQueryParserCollating() throws Exception { - testFarsiQueryParserCollating(analyzer); - } - public void testFarsiRangeFilterCollating() throws Exception { testFarsiRangeFilterCollating (analyzer, firstRangeBeginning, firstRangeEnd, diff --git a/contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java b/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java similarity index 96% rename from contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java rename to src/test/org/apache/lucene/collation/TestCollationKeyFilter.java index 3ffd5ac8597..401591e5af1 100644 --- 
a/contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java +++ b/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java @@ -60,11 +60,6 @@ public class TestCollationKeyFilter extends CollationTestBase { } } - public void testFarsiQueryParserCollating() throws Exception { - testFarsiQueryParserCollating(analyzer); - } - - public void testFarsiRangeFilterCollating() throws Exception { testFarsiRangeFilterCollating (analyzer, firstRangeBeginning, firstRangeEnd,