mirror of https://github.com/apache/lucene.git
LUCENE-2124: mv package.html docs to overview, like we did for queryparser, to avoid javadocs warnings
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@888784 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
550a4ef1af
commit
cc619905c4
|
@ -1,181 +0,0 @@
|
||||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
|
||||||
<!--
|
|
||||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
contributor license agreements. See the NOTICE file distributed with
|
|
||||||
this work for additional information regarding copyright ownership.
|
|
||||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
(the "License"); you may not use this file except in compliance with
|
|
||||||
the License. You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
-->
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Lucene Collation Package</title>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<p>
|
|
||||||
<code>ICUCollationKeyFilter</code>
|
|
||||||
converts each token into its binary <code>CollationKey</code> using the
|
|
||||||
provided <code>Collator</code>, and then encodes the <code>CollationKey</code>
|
|
||||||
as a String using
|
|
||||||
{@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be
|
|
||||||
stored as an index term.
|
|
||||||
</p>
|
|
||||||
<p>
|
|
||||||
<code>ICUCollationKeyFilter</code> depends on ICU4J 4.0 to produce the
|
|
||||||
<code>CollationKey</code>s. <code>icu4j-collation-4.0.jar</code>,
|
|
||||||
a trimmed-down version of <code>icu4j-4.0.jar</code> that contains only the
|
|
||||||
code and data needed to support collation, is included in Lucene's Subversion
|
|
||||||
repository at <code>contrib/collation/lib/</code>.
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<h2>Use Cases</h2>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li>
|
|
||||||
Efficient sorting of terms in languages that use non-Unicode character
|
|
||||||
orderings. (Lucene Sort using a Locale can be very slow.)
|
|
||||||
</li>
|
|
||||||
<li>
|
|
||||||
Efficient range queries over fields that contain terms in languages that
|
|
||||||
use non-Unicode character orderings. (Range queries using a Locale can be
|
|
||||||
very slow.)
|
|
||||||
</li>
|
|
||||||
<li>
|
|
||||||
Effective Locale-specific normalization (case differences, diacritics, etc.).
|
|
||||||
({@link org.apache.lucene.analysis.LowerCaseFilter} and
|
|
||||||
{@link org.apache.lucene.analysis.ASCIIFoldingFilter} provide these services
|
|
||||||
in a generic way that doesn't take into account locale-specific needs.)
|
|
||||||
</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2>Example Usages</h2>
|
|
||||||
|
|
||||||
<h3>Farsi Range Queries</h3>
|
|
||||||
<code><pre>
|
|
||||||
Collator collator = Collator.getInstance(new Locale("ar"));
|
|
||||||
ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
|
|
||||||
RAMDirectory ramDir = new RAMDirectory();
|
|
||||||
IndexWriter writer = new IndexWriter
|
|
||||||
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
|
||||||
Document doc = new Document();
|
|
||||||
doc.add(new Field("content", "\u0633\u0627\u0628",
|
|
||||||
Field.Store.YES, Field.Index.ANALYZED));
|
|
||||||
writer.addDocument(doc);
|
|
||||||
writer.close();
|
|
||||||
IndexSearcher is = new IndexSearcher(ramDir, true);
|
|
||||||
|
|
||||||
// The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
|
|
||||||
// to be passed through an analyzer - Lucene's standard QueryParser does not
|
|
||||||
// allow this.
|
|
||||||
AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
|
|
||||||
aqp.setLowercaseExpandedTerms(false);
|
|
||||||
|
|
||||||
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
|
|
||||||
// orders the U+0698 character before the U+0633 character, so the single
|
|
||||||
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
|
|
||||||
// with a Farsi Collator (or an Arabic one for the case when Farsi is not
|
|
||||||
// supported).
|
|
||||||
ScoreDoc[] result
|
|
||||||
= is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
|
|
||||||
assertEquals("The index Term should not be included.", 0, result.length);
|
|
||||||
</pre></code>
|
|
||||||
|
|
||||||
<h3>Danish Sorting</h3>
|
|
||||||
<code><pre>
|
|
||||||
Analyzer analyzer
|
|
||||||
= new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
|
|
||||||
RAMDirectory indexStore = new RAMDirectory();
|
|
||||||
IndexWriter writer = new IndexWriter
|
|
||||||
(indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
|
||||||
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
|
|
||||||
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
|
|
||||||
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
|
|
||||||
for (int i = 0 ; i < data.length ; ++i) {
|
|
||||||
Document doc = new Document();
|
|
||||||
doc.add(new Field("tracer", tracer[i], Field.Store.YES, Field.Index.NO));
|
|
||||||
doc.add(new Field("contents", data[i], Field.Store.NO, Field.Index.ANALYZED));
|
|
||||||
writer.addDocument(doc);
|
|
||||||
}
|
|
||||||
writer.close();
|
|
||||||
Searcher searcher = new IndexSearcher(indexStore, true);
|
|
||||||
Sort sort = new Sort();
|
|
||||||
sort.setSort(new SortField("contents", SortField.STRING));
|
|
||||||
Query query = new MatchAllDocsQuery();
|
|
||||||
ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
|
|
||||||
for (int i = 0 ; i < result.length ; ++i) {
|
|
||||||
Document doc = searcher.doc(result[i].doc);
|
|
||||||
assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
|
|
||||||
}
|
|
||||||
</pre></code>
|
|
||||||
|
|
||||||
<h3>Turkish Case Normalization</h3>
|
|
||||||
<code><pre>
|
|
||||||
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
|
|
||||||
collator.setStrength(Collator.PRIMARY);
|
|
||||||
Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
|
|
||||||
RAMDirectory ramDir = new RAMDirectory();
|
|
||||||
IndexWriter writer = new IndexWriter
|
|
||||||
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
|
||||||
Document doc = new Document();
|
|
||||||
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
|
|
||||||
writer.addDocument(doc);
|
|
||||||
writer.close();
|
|
||||||
IndexSearcher is = new IndexSearcher(ramDir, true);
|
|
||||||
QueryParser parser = new QueryParser("contents", analyzer);
|
|
||||||
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
|
|
||||||
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
|
|
||||||
assertEquals("The index Term should be included.", 1, result.length);
|
|
||||||
</pre></code>
|
|
||||||
|
|
||||||
<h2>Caveats and Comparisons</h2>
|
|
||||||
<p>
|
|
||||||
<strong>WARNING:</strong> Make sure you use exactly the same
|
|
||||||
<code>Collator</code> at index and query time -- <code>CollationKey</code>s
|
|
||||||
are only comparable when produced by
|
|
||||||
the same <code>Collator</code>. Since {@link java.text.RuleBasedCollator}s
|
|
||||||
are not independently versioned, it is unsafe to search against stored
|
|
||||||
<code>CollationKey</code>s unless the following are exactly the same (best
|
|
||||||
practice is to store this information with the index and check that they
|
|
||||||
remain the same at query time):
|
|
||||||
</p>
|
|
||||||
<ol>
|
|
||||||
<li>JVM vendor</li>
|
|
||||||
<li>JVM version, including patch version</li>
|
|
||||||
<li>
|
|
||||||
The language (and country and variant, if specified) of the Locale
|
|
||||||
used when constructing the collator via
|
|
||||||
{@link java.text.Collator#getInstance(java.util.Locale)}.
|
|
||||||
</li>
|
|
||||||
<li>
|
|
||||||
The collation strength used - see {@link java.text.Collator#setStrength(int)}
|
|
||||||
</li>
|
|
||||||
</ol>
|
|
||||||
<p>
|
|
||||||
<code>ICUCollationKeyFilter</code> uses ICU4J's <code>Collator</code>, which
|
|
||||||
makes its version available, thus allowing collation to be versioned
|
|
||||||
independently from the JVM. <code>ICUCollationKeyFilter</code> is also
|
|
||||||
significantly faster and generates significantly shorter keys than
|
|
||||||
<code>CollationKeyFilter</code>. See
|
|
||||||
<a href="http://site.icu-project.org/charts/collation-icu4j-sun"
|
|
||||||
>http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
|
|
||||||
generation timing and key length comparisons between ICU4J and
|
|
||||||
<code>java.text.Collator</code> over several languages.
|
|
||||||
</p>
|
|
||||||
<p>
|
|
||||||
<code>CollationKey</code>s generated by <code>java.text.Collator</code>s are
|
|
||||||
not compatible with those generated by ICU Collators. Specifically, if
|
|
||||||
you use <code>CollationKeyFilter</code> to generate index terms, do not use
|
|
||||||
<code>ICUCollationKeyFilter</code> on the query side, or vice versa.
|
|
||||||
</p>
|
|
||||||
<pre>
|
|
||||||
</pre>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
|
@ -20,6 +20,163 @@
|
||||||
Apache Lucene ICUCollationKeyFilter/Analyzer
|
Apache Lucene ICUCollationKeyFilter/Analyzer
|
||||||
</title>
|
</title>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
</body>
|
<p>
|
||||||
</html>
|
<code>ICUCollationKeyFilter</code>
|
||||||
|
converts each token into its binary <code>CollationKey</code> using the
|
||||||
|
provided <code>Collator</code>, and then encodes the <code>CollationKey</code>
|
||||||
|
as a String using
|
||||||
|
{@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be
|
||||||
|
stored as an index term.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
<code>ICUCollationKeyFilter</code> depends on ICU4J 4.0 to produce the
|
||||||
|
<code>CollationKey</code>s. <code>icu4j-collation-4.0.jar</code>,
|
||||||
|
a trimmed-down version of <code>icu4j-4.0.jar</code> that contains only the
|
||||||
|
code and data needed to support collation, is included in Lucene's Subversion
|
||||||
|
repository at <code>contrib/collation/lib/</code>.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>Use Cases</h2>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>
|
||||||
|
Efficient sorting of terms in languages that use non-Unicode character
|
||||||
|
orderings. (Lucene Sort using a Locale can be very slow.)
|
||||||
|
</li>
|
||||||
|
<li>
|
||||||
|
Efficient range queries over fields that contain terms in languages that
|
||||||
|
use non-Unicode character orderings. (Range queries using a Locale can be
|
||||||
|
very slow.)
|
||||||
|
</li>
|
||||||
|
<li>
|
||||||
|
Effective Locale-specific normalization (case differences, diacritics, etc.).
|
||||||
|
({@link org.apache.lucene.analysis.LowerCaseFilter} and
|
||||||
|
{@link org.apache.lucene.analysis.ASCIIFoldingFilter} provide these services
|
||||||
|
in a generic way that doesn't take into account locale-specific needs.)
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<h2>Example Usages</h2>
|
||||||
|
|
||||||
|
<h3>Farsi Range Queries</h3>
|
||||||
|
<code><pre>
|
||||||
|
Collator collator = Collator.getInstance(new Locale("ar"));
|
||||||
|
ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
|
||||||
|
RAMDirectory ramDir = new RAMDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter
|
||||||
|
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field("content", "\u0633\u0627\u0628",
|
||||||
|
Field.Store.YES, Field.Index.ANALYZED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
writer.close();
|
||||||
|
IndexSearcher is = new IndexSearcher(ramDir, true);
|
||||||
|
|
||||||
|
// The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
|
||||||
|
// to be passed through an analyzer - Lucene's standard QueryParser does not
|
||||||
|
// allow this.
|
||||||
|
AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
|
||||||
|
aqp.setLowercaseExpandedTerms(false);
|
||||||
|
|
||||||
|
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
|
||||||
|
// orders the U+0698 character before the U+0633 character, so the single
|
||||||
|
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
|
||||||
|
// with a Farsi Collator (or an Arabic one for the case when Farsi is not
|
||||||
|
// supported).
|
||||||
|
ScoreDoc[] result
|
||||||
|
= is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
|
||||||
|
assertEquals("The index Term should not be included.", 0, result.length);
|
||||||
|
</pre></code>
|
||||||
|
|
||||||
|
<h3>Danish Sorting</h3>
|
||||||
|
<code><pre>
|
||||||
|
Analyzer analyzer
|
||||||
|
= new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
|
||||||
|
RAMDirectory indexStore = new RAMDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter
|
||||||
|
(indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
||||||
|
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
|
||||||
|
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
|
||||||
|
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
|
||||||
|
for (int i = 0 ; i < data.length ; ++i) {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field("tracer", tracer[i], Field.Store.YES, Field.Index.NO));
|
||||||
|
doc.add(new Field("contents", data[i], Field.Store.NO, Field.Index.ANALYZED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
writer.close();
|
||||||
|
Searcher searcher = new IndexSearcher(indexStore, true);
|
||||||
|
Sort sort = new Sort();
|
||||||
|
sort.setSort(new SortField("contents", SortField.STRING));
|
||||||
|
Query query = new MatchAllDocsQuery();
|
||||||
|
ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
|
||||||
|
for (int i = 0 ; i < result.length ; ++i) {
|
||||||
|
Document doc = searcher.doc(result[i].doc);
|
||||||
|
assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
|
||||||
|
}
|
||||||
|
</pre></code>
|
||||||
|
|
||||||
|
<h3>Turkish Case Normalization</h3>
|
||||||
|
<code><pre>
|
||||||
|
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
|
||||||
|
collator.setStrength(Collator.PRIMARY);
|
||||||
|
Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
|
||||||
|
RAMDirectory ramDir = new RAMDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter
|
||||||
|
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
writer.close();
|
||||||
|
IndexSearcher is = new IndexSearcher(ramDir, true);
|
||||||
|
QueryParser parser = new QueryParser("contents", analyzer);
|
||||||
|
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
|
||||||
|
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals("The index Term should be included.", 1, result.length);
|
||||||
|
</pre></code>
|
||||||
|
|
||||||
|
<h2>Caveats and Comparisons</h2>
|
||||||
|
<p>
|
||||||
|
<strong>WARNING:</strong> Make sure you use exactly the same
|
||||||
|
<code>Collator</code> at index and query time -- <code>CollationKey</code>s
|
||||||
|
are only comparable when produced by
|
||||||
|
the same <code>Collator</code>. Since {@link java.text.RuleBasedCollator}s
|
||||||
|
are not independently versioned, it is unsafe to search against stored
|
||||||
|
<code>CollationKey</code>s unless the following are exactly the same (best
|
||||||
|
practice is to store this information with the index and check that they
|
||||||
|
remain the same at query time):
|
||||||
|
</p>
|
||||||
|
<ol>
|
||||||
|
<li>JVM vendor</li>
|
||||||
|
<li>JVM version, including patch version</li>
|
||||||
|
<li>
|
||||||
|
The language (and country and variant, if specified) of the Locale
|
||||||
|
used when constructing the collator via
|
||||||
|
{@link java.text.Collator#getInstance(java.util.Locale)}.
|
||||||
|
</li>
|
||||||
|
<li>
|
||||||
|
The collation strength used - see {@link java.text.Collator#setStrength(int)}
|
||||||
|
</li>
|
||||||
|
</ol>
|
||||||
|
<p>
|
||||||
|
<code>ICUCollationKeyFilter</code> uses ICU4J's <code>Collator</code>, which
|
||||||
|
makes its version available, thus allowing collation to be versioned
|
||||||
|
independently from the JVM. <code>ICUCollationKeyFilter</code> is also
|
||||||
|
significantly faster and generates significantly shorter keys than
|
||||||
|
<code>CollationKeyFilter</code>. See
|
||||||
|
<a href="http://site.icu-project.org/charts/collation-icu4j-sun"
|
||||||
|
>http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
|
||||||
|
generation timing and key length comparisons between ICU4J and
|
||||||
|
<code>java.text.Collator</code> over several languages.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
<code>CollationKey</code>s generated by <code>java.text.Collator</code>s are
|
||||||
|
not compatible with those generated by ICU Collators. Specifically, if
|
||||||
|
you use <code>CollationKeyFilter</code> to generate index terms, do not use
|
||||||
|
<code>ICUCollationKeyFilter</code> on the query side, or vice versa.
|
||||||
|
</p>
|
||||||
|
<pre>
|
||||||
|
</pre>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
|
Loading…
Reference in New Issue