mirror of https://github.com/apache/lucene.git
LUCENE-2124: mv package.html docs to overview, like we did for queryparser, to avoid javadocs warnings
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@888784 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
550a4ef1af
commit
cc619905c4
|
@ -1,181 +0,0 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<title>Lucene Collation Package</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
<code>ICUCollationKeyFilter</code>
|
||||
converts each token into its binary <code>CollationKey</code> using the
|
||||
provided <code>Collator</code>, and then encodes the <code>CollationKey</code>
|
||||
as a String using
|
||||
{@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be
|
||||
stored as an index term.
|
||||
</p>
|
||||
<p>
|
||||
<code>ICUCollationKeyFilter</code> depends on ICU4J 4.0 to produce the
|
||||
<code>CollationKey</code>s. <code>icu4j-collation-4.0.jar</code>,
|
||||
a trimmed-down version of <code>icu4j-4.0.jar</code> that contains only the
|
||||
code and data needed to support collation, is included in Lucene's Subversion
|
||||
repository at <code>contrib/collation/lib/</code>.
|
||||
</p>
|
||||
|
||||
<h2>Use Cases</h2>
|
||||
|
||||
<ul>
|
||||
<li>
|
||||
Efficient sorting of terms in languages that use non-Unicode character
|
||||
orderings. (Lucene Sort using a Locale can be very slow.)
|
||||
</li>
|
||||
<li>
|
||||
Efficient range queries over fields that contain terms in languages that
|
||||
use non-Unicode character orderings. (Range queries using a Locale can be
|
||||
very slow.)
|
||||
</li>
|
||||
<li>
|
||||
Effective Locale-specific normalization (case differences, diacritics, etc.).
|
||||
({@link org.apache.lucene.analysis.LowerCaseFilter} and
|
||||
{@link org.apache.lucene.analysis.ASCIIFoldingFilter} provide these services
|
||||
in a generic way that doesn't take into account locale-specific needs.)
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<h2>Example Usages</h2>
|
||||
|
||||
<h3>Farsi Range Queries</h3>
|
||||
<code><pre>
|
||||
Collator collator = Collator.getInstance(new Locale("ar"));
|
||||
ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
|
||||
RAMDirectory ramDir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter
|
||||
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("content", "\u0633\u0627\u0628",
|
||||
Field.Store.YES, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
IndexSearcher is = new IndexSearcher(ramDir, true);
|
||||
|
||||
// The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
|
||||
// to be passed through an analyzer - Lucene's standard QueryParser does not
|
||||
// allow this.
|
||||
AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
|
||||
aqp.setLowercaseExpandedTerms(false);
|
||||
|
||||
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
|
||||
// orders the U+0698 character before the U+0633 character, so the single
|
||||
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
|
||||
// with a Farsi Collator (or an Arabic one for the case when Farsi is not
|
||||
// supported).
|
||||
ScoreDoc[] result
|
||||
= is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
|
||||
assertEquals("The index Term should not be included.", 0, result.length);
|
||||
</pre></code>
|
||||
|
||||
<h3>Danish Sorting</h3>
|
||||
<code><pre>
|
||||
Analyzer analyzer
|
||||
= new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
|
||||
RAMDirectory indexStore = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter
|
||||
(indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
||||
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
|
||||
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
|
||||
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
|
||||
for (int i = 0 ; i < data.length ; ++i) {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("tracer", tracer[i], Field.Store.YES, Field.Index.NO));
|
||||
doc.add(new Field("contents", data[i], Field.Store.NO, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
writer.close();
|
||||
Searcher searcher = new IndexSearcher(indexStore, true);
|
||||
Sort sort = new Sort();
|
||||
sort.setSort(new SortField("contents", SortField.STRING));
|
||||
Query query = new MatchAllDocsQuery();
|
||||
ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
|
||||
for (int i = 0 ; i < result.length ; ++i) {
|
||||
Document doc = searcher.doc(result[i].doc);
|
||||
assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
|
||||
}
|
||||
</pre></code>
|
||||
|
||||
<h3>Turkish Case Normalization</h3>
|
||||
<code><pre>
|
||||
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
|
||||
collator.setStrength(Collator.PRIMARY);
|
||||
Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
|
||||
RAMDirectory ramDir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter
|
||||
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
IndexSearcher is = new IndexSearcher(ramDir, true);
|
||||
QueryParser parser = new QueryParser("contents", analyzer);
|
||||
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
|
||||
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
|
||||
assertEquals("The index Term should be included.", 1, result.length);
|
||||
</pre></code>
|
||||
|
||||
<h2>Caveats and Comparisons</h2>
|
||||
<p>
|
||||
<strong>WARNING:</strong> Make sure you use exactly the same
|
||||
<code>Collator</code> at index and query time -- <code>CollationKey</code>s
|
||||
are only comparable when produced by
|
||||
the same <code>Collator</code>. Since {@link java.text.RuleBasedCollator}s
|
||||
are not independently versioned, it is unsafe to search against stored
|
||||
<code>CollationKey</code>s unless the following are exactly the same (best
|
||||
practice is to store this information with the index and check that they
|
||||
remain the same at query time):
|
||||
</p>
|
||||
<ol>
|
||||
<li>JVM vendor</li>
|
||||
<li>JVM version, including patch version</li>
|
||||
<li>
|
||||
The language (and country and variant, if specified) of the Locale
|
||||
used when constructing the collator via
|
||||
{@link java.text.Collator#getInstance(java.util.Locale)}.
|
||||
</li>
|
||||
<li>
|
||||
The collation strength used - see {@link java.text.Collator#setStrength(int)}
|
||||
</li>
|
||||
</ol>
|
||||
<p>
|
||||
<code>ICUCollationKeyFilter</code> uses ICU4J's <code>Collator</code>, which
|
||||
makes its version available, thus allowing collation to be versioned
|
||||
independently from the JVM. <code>ICUCollationKeyFilter</code> is also
|
||||
significantly faster and generates significantly shorter keys than
|
||||
<code>CollationKeyFilter</code>. See
|
||||
<a href="http://site.icu-project.org/charts/collation-icu4j-sun"
|
||||
>http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
|
||||
generation timing and key length comparisons between ICU4J and
|
||||
<code>java.text.Collator</code> over several languages.
|
||||
</p>
|
||||
<p>
|
||||
<code>CollationKey</code>s generated by <code>java.text.Collator</code>s are
|
||||
not compatible with those generated by ICU Collators. Specifically, if
|
||||
you use <code>CollationKeyFilter</code> to generate index terms, do not use
|
||||
<code>ICUCollationKeyFilter</code> on the query side, or vice versa.
|
||||
</p>
|
||||
<pre>
|
||||
</pre>
|
||||
</body>
|
||||
</html>
|
|
@ -20,6 +20,163 @@
|
|||
Apache Lucene ICUCollationKeyFilter/Analyzer
|
||||
</title>
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>
|
||||
<body>
|
||||
<p>
|
||||
<code>ICUCollationKeyFilter</code>
|
||||
converts each token into its binary <code>CollationKey</code> using the
|
||||
provided <code>Collator</code>, and then encodes the <code>CollationKey</code>
|
||||
as a String using
|
||||
{@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be
|
||||
stored as an index term.
|
||||
</p>
|
||||
<p>
|
||||
<code>ICUCollationKeyFilter</code> depends on ICU4J 4.0 to produce the
|
||||
<code>CollationKey</code>s. <code>icu4j-collation-4.0.jar</code>,
|
||||
a trimmed-down version of <code>icu4j-4.0.jar</code> that contains only the
|
||||
code and data needed to support collation, is included in Lucene's Subversion
|
||||
repository at <code>contrib/collation/lib/</code>.
|
||||
</p>
|
||||
|
||||
<h2>Use Cases</h2>
|
||||
|
||||
<ul>
|
||||
<li>
|
||||
Efficient sorting of terms in languages that use non-Unicode character
|
||||
orderings. (Lucene Sort using a Locale can be very slow.)
|
||||
</li>
|
||||
<li>
|
||||
Efficient range queries over fields that contain terms in languages that
|
||||
use non-Unicode character orderings. (Range queries using a Locale can be
|
||||
very slow.)
|
||||
</li>
|
||||
<li>
|
||||
Effective Locale-specific normalization (case differences, diacritics, etc.).
|
||||
({@link org.apache.lucene.analysis.LowerCaseFilter} and
|
||||
{@link org.apache.lucene.analysis.ASCIIFoldingFilter} provide these services
|
||||
in a generic way that doesn't take into account locale-specific needs.)
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<h2>Example Usages</h2>
|
||||
|
||||
<h3>Farsi Range Queries</h3>
|
||||
<code><pre>
|
||||
Collator collator = Collator.getInstance(new Locale("ar"));
|
||||
ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
|
||||
RAMDirectory ramDir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter
|
||||
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("content", "\u0633\u0627\u0628",
|
||||
Field.Store.YES, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
IndexSearcher is = new IndexSearcher(ramDir, true);
|
||||
|
||||
// The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
|
||||
// to be passed through an analyzer - Lucene's standard QueryParser does not
|
||||
// allow this.
|
||||
AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
|
||||
aqp.setLowercaseExpandedTerms(false);
|
||||
|
||||
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
|
||||
// orders the U+0698 character before the U+0633 character, so the single
|
||||
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
|
||||
// with a Farsi Collator (or an Arabic one for the case when Farsi is not
|
||||
// supported).
|
||||
ScoreDoc[] result
|
||||
= is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
|
||||
assertEquals("The index Term should not be included.", 0, result.length);
|
||||
</pre></code>
|
||||
|
||||
<h3>Danish Sorting</h3>
|
||||
<code><pre>
|
||||
Analyzer analyzer
|
||||
= new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
|
||||
RAMDirectory indexStore = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter
|
||||
(indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
||||
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
|
||||
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
|
||||
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
|
||||
for (int i = 0 ; i < data.length ; ++i) {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("tracer", tracer[i], Field.Store.YES, Field.Index.NO));
|
||||
doc.add(new Field("contents", data[i], Field.Store.NO, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
writer.close();
|
||||
Searcher searcher = new IndexSearcher(indexStore, true);
|
||||
Sort sort = new Sort();
|
||||
sort.setSort(new SortField("contents", SortField.STRING));
|
||||
Query query = new MatchAllDocsQuery();
|
||||
ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
|
||||
for (int i = 0 ; i < result.length ; ++i) {
|
||||
Document doc = searcher.doc(result[i].doc);
|
||||
assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
|
||||
}
|
||||
</pre></code>
|
||||
|
||||
<h3>Turkish Case Normalization</h3>
|
||||
<code><pre>
|
||||
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
|
||||
collator.setStrength(Collator.PRIMARY);
|
||||
Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
|
||||
RAMDirectory ramDir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter
|
||||
(ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
IndexSearcher is = new IndexSearcher(ramDir, true);
|
||||
QueryParser parser = new QueryParser("contents", analyzer);
|
||||
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
|
||||
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
|
||||
assertEquals("The index Term should be included.", 1, result.length);
|
||||
</pre></code>
|
||||
|
||||
<h2>Caveats and Comparisons</h2>
|
||||
<p>
|
||||
<strong>WARNING:</strong> Make sure you use exactly the same
|
||||
<code>Collator</code> at index and query time -- <code>CollationKey</code>s
|
||||
are only comparable when produced by
|
||||
the same <code>Collator</code>. Since {@link java.text.RuleBasedCollator}s
|
||||
are not independently versioned, it is unsafe to search against stored
|
||||
<code>CollationKey</code>s unless the following are exactly the same (best
|
||||
practice is to store this information with the index and check that they
|
||||
remain the same at query time):
|
||||
</p>
|
||||
<ol>
|
||||
<li>JVM vendor</li>
|
||||
<li>JVM version, including patch version</li>
|
||||
<li>
|
||||
The language (and country and variant, if specified) of the Locale
|
||||
used when constructing the collator via
|
||||
{@link java.text.Collator#getInstance(java.util.Locale)}.
|
||||
</li>
|
||||
<li>
|
||||
The collation strength used - see {@link java.text.Collator#setStrength(int)}
|
||||
</li>
|
||||
</ol>
|
||||
<p>
|
||||
<code>ICUCollationKeyFilter</code> uses ICU4J's <code>Collator</code>, which
|
||||
makes its version available, thus allowing collation to be versioned
|
||||
independently from the JVM. <code>ICUCollationKeyFilter</code> is also
|
||||
significantly faster and generates significantly shorter keys than
|
||||
<code>CollationKeyFilter</code>. See
|
||||
<a href="http://site.icu-project.org/charts/collation-icu4j-sun"
|
||||
>http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
|
||||
generation timing and key length comparisons between ICU4J and
|
||||
<code>java.text.Collator</code> over several languages.
|
||||
</p>
|
||||
<p>
|
||||
<code>CollationKey</code>s generated by <code>java.text.Collator</code>s are
|
||||
not compatible with those generated by ICU Collators. Specifically, if
|
||||
you use <code>CollationKeyFilter</code> to generate index terms, do not use
|
||||
<code>ICUCollationKeyFilter</code> on the query side, or vice versa.
|
||||
</p>
|
||||
<pre>
|
||||
</pre>
|
||||
</body>
|
||||
</html>
|
||||
|
|
Loading…
Reference in New Issue