- CollationKeyFilter
and ICUCollationKeyFilter
- convert each token into its binary CollationKey
using the
+ ICUCollationKeyFilter
+ converts each token into its binary CollationKey
using the
provided Collator
, and then encode the CollationKey
as a String using
{@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be
@@ -60,9 +60,8 @@
- // "fa" Locale is not supported by Sun JDK 1.4 or 1.5
Collator collator = Collator.getInstance(new Locale("ar"));
- CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
+ ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
@@ -92,7 +91,7 @@
Danish Sorting
Analyzer analyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ = new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
IndexWriter writer = new IndexWriter
(indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
@@ -121,7 +120,7 @@
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
- Analyzer analyzer = new CollationKeyAnalyzer(collator);
+ Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
diff --git a/contrib/collation/src/java/overview.html b/contrib/icu/src/java/overview.html
similarity index 90%
rename from contrib/collation/src/java/overview.html
rename to contrib/icu/src/java/overview.html
index cd090a9226d..b9d26c15330 100644
--- a/contrib/collation/src/java/overview.html
+++ b/contrib/icu/src/java/overview.html
@@ -17,8 +17,7 @@
- Apache Lucene CollationKeyFilter/Analyzer and
- ICUCollationKeyFilter/Analyzer
+ Apache Lucene ICUCollationKeyFilter/Analyzer
diff --git a/contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java b/contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
similarity index 96%
rename from contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
rename to contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
index 19a8359ebcc..b018ffc357c 100644
--- a/contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
+++ b/contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
@@ -37,11 +37,6 @@ public class TestICUCollationKeyAnalyzer extends CollationTestBase {
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
private String secondRangeEnd = encodeCollationKey
(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
-
-
- public void testFarsiQueryParserCollating() throws Exception {
- testFarsiQueryParserCollating(analyzer);
- }
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,
diff --git a/contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java b/contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
similarity index 96%
rename from contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
rename to contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
index b7e7c8241b6..e0a6c998609 100644
--- a/contrib/collation/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
+++ b/contrib/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
@@ -57,11 +57,6 @@ public class TestICUCollationKeyFilter extends CollationTestBase {
}
}
- public void testFarsiQueryParserCollating() throws Exception {
- testFarsiQueryParserCollating(analyzer);
- }
-
-
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);
diff --git a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java b/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
similarity index 96%
rename from contrib/collation/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
rename to src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
index 397a06b21e3..081624a4b27 100644
--- a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
+++ b/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
@@ -59,7 +59,8 @@ import java.io.IOException;
*
*
*
- * {@link ICUCollationKeyAnalyzer} uses ICU4J's Collator, which makes
+ * The ICUCollationKeyAnalyzer
in the icu package of Lucene's
+ * contrib area uses ICU4J's Collator, which makes
* its version available, thus allowing collation to be versioned
* independently from the JVM. ICUCollationKeyAnalyzer is also significantly
* faster and generates significantly shorter keys than CollationKeyAnalyzer.
diff --git a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java b/src/java/org/apache/lucene/collation/CollationKeyFilter.java
similarity index 95%
rename from contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java
rename to src/java/org/apache/lucene/collation/CollationKeyFilter.java
index 96d5ef16afd..6f0ea0578d2 100644
--- a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java
+++ b/src/java/org/apache/lucene/collation/CollationKeyFilter.java
@@ -20,7 +20,6 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.IndexableBinaryStringTools;
@@ -58,7 +57,8 @@ import java.text.Collator;
*
*
*
- * {@link ICUCollationKeyFilter} uses ICU4J's Collator, which makes its
+ * The ICUCollationKeyFilter
in the icu package of Lucene's
+ * contrib area uses ICU4J's Collator, which makes its
* version available, thus allowing collation to be versioned independently
* from the JVM. ICUCollationKeyFilter is also significantly faster and
* generates significantly shorter keys than CollationKeyFilter. See
@@ -71,7 +71,7 @@ import java.text.Collator;
* CollationKeys generated by java.text.Collators are not compatible
* with those those generated by ICU Collators. Specifically, if you use
* CollationKeyFilter to generate index terms, do not use
- * {@link ICUCollationKeyFilter} on the query side, or vice versa.
+ * ICUCollationKeyFilter on the query side, or vice versa.
*
*/
public final class CollationKeyFilter extends TokenFilter {
diff --git a/src/java/org/apache/lucene/collation/package.html b/src/java/org/apache/lucene/collation/package.html
new file mode 100644
index 00000000000..b0c6f8016a8
--- /dev/null
+++ b/src/java/org/apache/lucene/collation/package.html
@@ -0,0 +1,176 @@
+
+
+
+
+ Lucene Collation Package
+
+
+
+ CollationKeyFilter
+ converts each token into its binary CollationKey
using the
+ provided Collator
, and then encodes the CollationKey
+ as a String using
+ {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be
+ stored as an index term.
+
+
+Use Cases
+
+
+ -
+ Efficient sorting of terms in languages that use non-Unicode character
+ orderings. (Lucene Sort using a Locale can be very slow.)
+
+ -
+ Efficient range queries over fields that contain terms in languages that
+ use non-Unicode character orderings. (Range queries using a Locale can be
+ very slow.)
+
+ -
+ Effective Locale-specific normalization (case differences, diacritics, etc.).
+ ({@link org.apache.lucene.analysis.LowerCaseFilter} and
+ {@link org.apache.lucene.analysis.ASCIIFoldingFilter} provide these services
+ in a generic way that doesn't take into account locale-specific needs.)
+
+
+
+Example Usages
+
+Farsi Range Queries
+
+ // "fa" Locale is not supported by Sun JDK 1.4 or 1.5
+ Collator collator = Collator.getInstance(new Locale("ar"));
+ CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
+ RAMDirectory ramDir = new RAMDirectory();
+ IndexWriter writer = new IndexWriter
+ (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ doc.add(new Field("content", "\u0633\u0627\u0628",
+ Field.Store.YES, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ writer.close();
+ IndexSearcher is = new IndexSearcher(ramDir, true);
+
+ // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
+ // to be passed through an analyzer - Lucene's standard QueryParser does not
+ // allow this.
+ AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
+ aqp.setLowercaseExpandedTerms(false);
+
+ // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
+ // orders the U+0698 character before the U+0633 character, so the single
+ // indexed Term above should NOT be returned by a TermRangeQuery
+ // with a Farsi Collator (or an Arabic one for the case when Farsi is not
+ // supported).
+ ScoreDoc[] result
+ = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
+ assertEquals("The index Term should not be included.", 0, result.length);
+
+
+Danish Sorting
+
+ Analyzer analyzer
+ = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ RAMDirectory indexStore = new RAMDirectory();
+ IndexWriter writer = new IndexWriter
+ (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ String[] tracer = new String[] { "A", "B", "C", "D", "E" };
+ String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
+ String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
+ for (int i = 0 ; i < data.length ; ++i) {
+ Document doc = new Document();
+ doc.add(new Field("tracer", tracer[i], Field.Store.YES, Field.Index.NO));
+ doc.add(new Field("contents", data[i], Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ }
+ writer.close();
+ Searcher searcher = new IndexSearcher(indexStore, true);
+ Sort sort = new Sort();
+ sort.setSort(new SortField("contents", SortField.STRING));
+ Query query = new MatchAllDocsQuery();
+ ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
+ for (int i = 0 ; i < result.length ; ++i) {
+ Document doc = searcher.doc(result[i].doc);
+ assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
+ }
+
+
+Turkish Case Normalization
+
+ Collator collator = Collator.getInstance(new Locale("tr", "TR"));
+ collator.setStrength(Collator.PRIMARY);
+ Analyzer analyzer = new CollationKeyAnalyzer(collator);
+ RAMDirectory ramDir = new RAMDirectory();
+ IndexWriter writer = new IndexWriter
+ (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ writer.close();
+ IndexSearcher is = new IndexSearcher(ramDir, true);
+ QueryParser parser = new QueryParser("contents", analyzer);
+ Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
+ ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
+ assertEquals("The index Term should be included.", 1, result.length);
+
+
+Caveats and Comparisons
+
+ WARNING: Make sure you use exactly the same
+ Collator
at index and query time -- CollationKey
s
+ are only comparable when produced by
+ the same Collator
. Since {@link java.text.RuleBasedCollator}s
+ are not independently versioned, it is unsafe to search against stored
+ CollationKey
s unless the following are exactly the same (best
+ practice is to store this information with the index and check that they
+ remain the same at query time):
+
+
+ - JVM vendor
+ - JVM version, including patch version
+ -
+ The language (and country and variant, if specified) of the Locale
+ used when constructing the collator via
+ {@link java.text.Collator#getInstance(java.util.Locale)}.
+
+ -
+ The collation strength used - see {@link java.text.Collator#setStrength(int)}
+
+
+
+ ICUCollationKeyFilter
, available in the icu package in Lucene's contrib area,
+ uses ICU4J's Collator
, which
+ makes its version available, thus allowing collation to be versioned
+ independently from the JVM. ICUCollationKeyFilter
is also
+ significantly faster and generates significantly shorter keys than
+ CollationKeyFilter
. See
+ http://site.icu-project.org/charts/collation-icu4j-sun for key
+ generation timing and key length comparisons between ICU4J and
+ java.text.Collator
over several languages.
+
+
+ CollationKey
s generated by java.text.Collator
s are
+ not compatible with those generated by ICU Collators. Specifically, if
+ you use CollationKeyFilter
to generate index terms, do not use
+ ICUCollationKeyFilter
on the query side, or vice versa.
+
+
+
+
+
diff --git a/contrib/collation/src/test/org/apache/lucene/collation/CollationTestBase.java b/src/test/org/apache/lucene/collation/CollationTestBase.java
similarity index 89%
rename from contrib/collation/src/test/org/apache/lucene/collation/CollationTestBase.java
rename to src/test/org/apache/lucene/collation/CollationTestBase.java
index 581f20a8b22..f5c8fe710cf 100644
--- a/contrib/collation/src/test/org/apache/lucene/collation/CollationTestBase.java
+++ b/src/test/org/apache/lucene/collation/CollationTestBase.java
@@ -38,7 +38,6 @@ import org.apache.lucene.search.SortField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.IndexableBinaryStringTools;
-import org.apache.lucene.queryParser.analyzing.AnalyzingQueryParser;
import org.apache.lucene.util.Version;
import java.io.IOException;
@@ -71,40 +70,7 @@ public class CollationTestBase extends TestCase {
IndexableBinaryStringTools.encode(begBuf, encodedBegBuf);
return new String(encodedBegArray);
}
-
- public void testFarsiQueryParserCollating(Analyzer analyzer) throws Exception {
-
- RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
- Document doc = new Document();
- doc.add(new Field("content", "\u0633\u0627\u0628",
- Field.Store.YES, Field.Index.ANALYZED));
- writer.addDocument(doc);
- writer.close();
- IndexSearcher is = new IndexSearcher(ramDir, true);
-
- AnalyzingQueryParser aqp = new AnalyzingQueryParser(Version.LUCENE_CURRENT, "content", analyzer);
- aqp.setLowercaseExpandedTerms(false);
-
- // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
- // orders the U+0698 character before the U+0633 character, so the single
- // index Term below should NOT be returned by a TermRangeQuery
- // with a Farsi Collator (or an Arabic one for the case when Farsi is not
- // supported).
-
- // Test TermRangeQuery
- ScoreDoc[] result
- = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
- assertEquals("The index Term should not be included.", 0, result.length);
-
- result = is.search(aqp.parse("[ \u0633 TO \u0638 ]"), null, 1000).scoreDocs;
- assertEquals("The index Term should be included.", 1, result.length);
-
- is.close();
- }
-
-
+
public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg,
String firstEnd, String secondBeg,
String secondEnd) throws Exception {
diff --git a/contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java b/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
similarity index 96%
rename from contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
rename to src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
index 4f6563c10b7..06c6d074deb 100644
--- a/contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
+++ b/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
@@ -41,11 +41,6 @@ public class TestCollationKeyAnalyzer extends CollationTestBase {
private String secondRangeEnd = encodeCollationKey
(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
-
- public void testFarsiQueryParserCollating() throws Exception {
- testFarsiQueryParserCollating(analyzer);
- }
-
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating
(analyzer, firstRangeBeginning, firstRangeEnd,
diff --git a/contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java b/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java
similarity index 96%
rename from contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java
rename to src/test/org/apache/lucene/collation/TestCollationKeyFilter.java
index 3ffd5ac8597..401591e5af1 100644
--- a/contrib/collation/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java
+++ b/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java
@@ -60,11 +60,6 @@ public class TestCollationKeyFilter extends CollationTestBase {
}
}
- public void testFarsiQueryParserCollating() throws Exception {
- testFarsiQueryParserCollating(analyzer);
- }
-
-
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating
(analyzer, firstRangeBeginning, firstRangeEnd,