diff --git a/contrib/icu/src/java/org/apache/lucene/collation/package.html b/contrib/icu/src/java/org/apache/lucene/collation/package.html deleted file mode 100644 index 4e2882cc7a4..00000000000 --- a/contrib/icu/src/java/org/apache/lucene/collation/package.html +++ /dev/null @@ -1,181 +0,0 @@ - - - -
-
- ICUCollationKeyFilter
- converts each token into its binary CollationKey
using the
- provided Collator
, and then encodes the CollationKey
- as a String using
- {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be
- stored as an index term.
-
- ICUCollationKeyFilter
depends on ICU4J 4.0 to produce the
- CollationKey
s. icu4j-collation-4.0.jar
,
- a trimmed-down version of icu4j-4.0.jar
that contains only the
- code and data needed to support collation, is included in Lucene's Subversion
- repository at contrib/collation/lib/
.
-
- Collator collator = Collator.getInstance(new Locale("ar"));
- ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
- RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
- Document doc = new Document();
- doc.add(new Field("content", "\u0633\u0627\u0628",
- Field.Store.YES, Field.Index.ANALYZED));
- writer.addDocument(doc);
- writer.close();
- IndexSearcher is = new IndexSearcher(ramDir, true);
-
- // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
- // to be passed through an analyzer - Lucene's standard QueryParser does not
- // allow this.
- AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
- aqp.setLowercaseExpandedTerms(false);
-
- // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
- // orders the U+0698 character before the U+0633 character, so the single
- // indexed Term above should NOT be returned by a ConstantScoreRangeQuery
- // with a Farsi Collator (or an Arabic one for the case when Farsi is not
- // supported).
- ScoreDoc[] result
- = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
- assertEquals("The index Term should not be included.", 0, result.length);
-
-
-
- Analyzer analyzer
- = new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
- RAMDirectory indexStore = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
- String[] tracer = new String[] { "A", "B", "C", "D", "E" };
- String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
- String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
- for (int i = 0 ; i < data.length ; ++i) {
- Document doc = new Document();
- doc.add(new Field("tracer", tracer[i], Field.Store.YES, Field.Index.NO));
- doc.add(new Field("contents", data[i], Field.Store.NO, Field.Index.ANALYZED));
- writer.addDocument(doc);
- }
- writer.close();
- Searcher searcher = new IndexSearcher(indexStore, true);
- Sort sort = new Sort();
- sort.setSort(new SortField("contents", SortField.STRING));
- Query query = new MatchAllDocsQuery();
- ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
- for (int i = 0 ; i < result.length ; ++i) {
- Document doc = searcher.doc(result[i].doc);
- assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
- }
-
-
-
- Collator collator = Collator.getInstance(new Locale("tr", "TR"));
- collator.setStrength(Collator.PRIMARY);
- Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
- RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
- Document doc = new Document();
- doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
- writer.addDocument(doc);
- writer.close();
- IndexSearcher is = new IndexSearcher(ramDir, true);
- QueryParser parser = new QueryParser("contents", analyzer);
- Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
- ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
- assertEquals("The index Term should be included.", 1, result.length);
-
-
-
- WARNING: Make sure you use exactly the same
- Collator
at index and query time -- CollationKey
s
- are only comparable when produced by
- the same Collator
. Since {@link java.text.RuleBasedCollator}s
- are not independently versioned, it is unsafe to search against stored
- CollationKey
s unless the following are exactly the same (best
- practice is to store this information with the index and check that they
- remain the same at query time):
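-   1. JVM vendor
-   2. JVM version, including patch version
-   3. The language (and country and variant, if specified) of the Locale
-      used when constructing the collator via
-      {@link java.text.Collator#getInstance(java.util.Locale)}.
-   4. The collation strength used - see {@link java.text.Collator#setStrength(int)}
-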
- ICUCollationKeyFilter
uses ICU4J's Collator
, which
- makes its version available, thus allowing collation to be versioned
- independently from the JVM. ICUCollationKeyFilter
is also
- significantly faster and generates significantly shorter keys than
- CollationKeyFilter
. See
- http://site.icu-project.org/charts/collation-icu4j-sun for key
- generation timing and key length comparisons between ICU4J and
- java.text.Collator
over several languages.
-
- CollationKey
s generated by java.text.Collator
s are
- not compatible with those generated by ICU Collators. Specifically, if
- you use CollationKeyFilter
to generate index terms, do not use
- ICUCollationKeyFilter
on the query side, or vice versa.
-
-- - diff --git a/contrib/icu/src/java/overview.html b/contrib/icu/src/java/overview.html index b9d26c15330..c11eeefccbd 100644 --- a/contrib/icu/src/java/overview.html +++ b/contrib/icu/src/java/overview.html @@ -20,6 +20,163 @@ Apache Lucene ICUCollationKeyFilter/Analyzer - - -