mirror of https://github.com/apache/lucene.git
LUCENE-9209: fix javadocs to be html5, enable doclint html checks, remove jtidy
Current javadocs declare an HTML5 doctype (!DOCTYPE HTML). Some HTML5 features are used, but unfortunately some constructs that do not exist in HTML5 are used as well. Because of this, we have no checking of any HTML syntax at all: jtidy is disabled because it only understands HTML4, doclint is disabled because it checks HTML5, and our docs are valid as neither.

The javadoc "doclint" feature can efficiently check that the HTML isn't crazy; we just have to fix the really ancient removed/deprecated constructs (such as use of the tt tag). This change enables the HTML checking in both ant and gradle.

The docs are fixed via straightforward transformations. One exception is table cellpadding: for this, some helper CSS classes were added to make the transition easier, since the padding must be applied to the inner th/td cells and cannot be expressed inline on the table itself. I added TODOs; we should clean this up. Most problems look like they were generated from a GUI or similar rather than written by a human.
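These transformations repeat throughout the diff below. As a minimal before/after sketch of the recurring patterns (the element and attribute names are the ones actually being replaced; the surrounding snippet is illustrative only, not taken from any single file):

    <!-- before: HTML4-era constructs rejected by doclint's HTML5 checks -->
    <a name="example"></a>
    <table border="1" cellpadding="2" summary="example table">
      <tr align="right"><td><tt>someTerm</tt></td></tr>
    </table>
    <hr/>

    <!-- after: the HTML5 equivalents used in this commit -->
    <a id="example"></a>
    <table class="padding2" style="border: 1px solid">
      <caption>example table</caption>
      <tr style="text-align: right"><td><code>someTerm</code></td></tr>
    </table>
    <hr>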
parent abd282d258
commit 0d339043e3
@@ -44,7 +44,7 @@ allprojects {
  )

  opts.addStringOption("-release", "11")
- opts.addBooleanOption('Xdoclint:all,-missing,-accessibility,-html', true)
+ opts.addBooleanOption('Xdoclint:all,-missing,-accessibility', true)

  def libName = project.path.startsWith(":lucene") ? "Lucene" : "Solr"
  opts.overview = file("src/main/java/overview.html").toString()

@@ -39,10 +39,10 @@ public class BrazilianStemmer {
  }

  /**
-  * Stems the given term to an unique <tt>discriminator</tt>.
+  * Stems the given term to an unique <code>discriminator</code>.
   *
   * @param term The term that should be stemmed.
-  * @return Discriminator for <tt>term</tt>
+  * @return Discriminator for <code>term</code>
   */
  protected String stem( String term ) {
  boolean altered = false ; // altered the term

@@ -30184,7 +30184,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
  *
  * All internal variables are reset, the old input stream
  * <b>cannot</b> be reused (internal buffer is discarded and lost).
- * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ * Lexical state is set to <code>ZZ_INITIAL</code>.
  *
  * Internal scan buffer is resized down to its initial length, if it has grown.
  *

@@ -30232,7 +30232,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {


  /**
-  * Returns the character at position <tt>pos</tt> from the
+  * Returns the character at position <code>pos</code> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
@@ -18,7 +18,8 @@
  /**
   * A filter that decomposes compound words you find in many Germanic
   * languages into the word parts. This example shows what it does:
-  * <table border="1" summary="example input stream">
+  * <table style="border: 1px solid">
+  * <caption>example input stream</caption>
   * <tr>
   * <th>Input token stream</th>
   * </tr>

@@ -27,7 +28,8 @@
   * </tr>
   * </table>
   * <br>
-  * <table border="1" summary="example output stream">
+  * <table style="border: 1px solid">
+  * <caption>example output stream</caption>
   * <tr>
   * <th>Output token stream</th>
   * </tr>

@@ -118,7 +120,8 @@
   *
   * <h3>Which variant should I use?</h3>
   * This decision matrix should help you:
-  * <table border="1" summary="comparison of dictionary and hyphenation based decompounding">
+  * <table style="border: 1px solid">
+  * <caption>comparison of dictionary and hyphenation based decompounding</caption>
   * <tr>
   * <th>Token filter</th>
   * <th>Output quality</th>
@@ -37,17 +37,17 @@ public class GermanStemmer
  private StringBuilder sb = new StringBuilder();

  /**
-  * Amount of characters that are removed with <tt>substitute()</tt> while stemming.
+  * Amount of characters that are removed with <code>substitute()</code> while stemming.
   */
  private int substCount = 0;

  private static final Locale locale = new Locale("de", "DE");

  /**
-  * Stemms the given term to an unique <tt>discriminator</tt>.
+  * Stemms the given term to an unique <code>discriminator</code>.
   *
   * @param term The term that should be stemmed.
-  * @return Discriminator for <tt>term</tt>
+  * @return Discriminator for <code>term</code>
   */
  protected String stem( String term )
  {
@@ -27,23 +27,23 @@ import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
  * Factory for {@link ConcatenateGraphFilter}.
  *
  * <ul>
- * <li><tt>preserveSep</tt>:
+ * <li><code>preserveSep</code>:
  *   For lucene versions lesser than {@link org.apache.lucene.util.Version#LUCENE_8_4_0}
  *   Whether {@link ConcatenateGraphFilter#SEP_LABEL}
  *   should separate the input tokens in the concatenated token
  * </li>
- * <li><tt>tokenSeparator</tt>:
+ * <li><code>tokenSeparator</code>:
  *   Separator to use for concatenation. If not present,
  *   {@link ConcatenateGraphFilter#DEFAULT_TOKEN_SEPARATOR} will be used.
  *   If empty, tokens will be concatenated without any separators.
  * </li>
- * <li><tt>preservePositionIncrements</tt>:
+ * <li><code>preservePositionIncrements</code>:
  *   Whether to add an empty token for missing positions.
  *   The effect is a consecutive {@link ConcatenateGraphFilter#SEP_LABEL}.
  *   When false, it's as if there were no missing positions
  *   (we pretend the surrounding tokens were adjacent).
  * </li>
- * <li><tt>maxGraphExpansions</tt>:
+ * <li><code>maxGraphExpansions</code>:
  *   If the tokenStream graph has more than this many possible paths through, then we'll throw
  *   {@link TooComplexToDeterminizeException} to preserve the stability and memory of the
  *   machine.
@@ -28,7 +28,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  * Tokenizes the given token into n-grams of given size(s).
  * <p>
  * This {@link TokenFilter} create n-grams from the beginning edge of a input token.
- * <p><a name="match_version"></a>As of Lucene 4.4, this filter handles correctly
+ * <p><a id="match_version"></a>As of Lucene 4.4, this filter handles correctly
  * supplementary characters.
  */
  public final class EdgeNGramTokenFilter extends TokenFilter {
@@ -24,7 +24,7 @@ import org.apache.lucene.util.AttributeFactory;
  * Tokenizes the input from an edge into n-grams of given size(s).
  * <p>
  * This {@link Tokenizer} create n-grams from the beginning edge of a input token.
- * <p><a name="match_version"></a>As of Lucene 4.4, this class supports
+ * <p><a id="match_version"></a>As of Lucene 4.4, this class supports
  * {@link #isTokenChar(int) pre-tokenization} and correctly handles
  * supplementary characters.
  */
@@ -33,13 +33,14 @@ import org.apache.lucene.util.AttributeFactory;
  * that characters between startOffset and endOffset in the original stream are
  * the same as the term chars.
  * <p>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
- * <table summary="ngram tokens example">
+ * <table>
+ * <caption>ngram tokens example</caption>
  * <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
  * <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
  * <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
  * <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
  * </table>
- * <a name="version"></a>
+ * <a id="version"></a>
  * <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
  * <li>tokenize in a streaming fashion to support streams which are larger
  * than 1024 chars (limit of the previous version),
@@ -39,7 +39,7 @@ public class ClassicFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /** Returns the next token in the stream, or null at EOS.
-  * <p>Removes <tt>'s</tt> from the end of words.
+  * <p>Removes <code>'s</code> from the end of words.
   * <p>Removes dots from acronyms.
   */
  @Override
@@ -481,7 +481,7 @@ public final void getText(CharTermAttribute t) {
  *
  * All internal variables are reset, the old input stream
  * <b>cannot</b> be reused (internal buffer is discarded and lost).
- * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ * Lexical state is set to <code>ZZ_INITIAL</code>.
  *
  * Internal scan buffer is resized down to its initial length, if it has grown.
  *

@@ -529,7 +529,7 @@ public final void getText(CharTermAttribute t) {


  /**
-  * Returns the character at position <tt>pos</tt> from the
+  * Returns the character at position <code>pos</code> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
@@ -40861,7 +40861,7 @@ public final class UAX29URLEmailTokenizerImpl {
  *
  * All internal variables are reset, the old input stream
  * <b>cannot</b> be reused (internal buffer is discarded and lost).
- * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ * Lexical state is set to <code>ZZ_INITIAL</code>.
  *
  * Internal scan buffer is resized down to its initial length, if it has grown.
  *

@@ -40909,7 +40909,7 @@ public final class UAX29URLEmailTokenizerImpl {


  /**
-  * Returns the character at position <tt>pos</tt> from the
+  * Returns the character at position <code>pos</code> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
@@ -620,7 +620,7 @@ final void reset() {
  *
  * All internal variables are reset, the old input stream
  * <b>cannot</b> be reused (internal buffer is discarded and lost).
- * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ * Lexical state is set to <code>ZZ_INITIAL</code>.
  *
  * Internal scan buffer is resized down to its initial length, if it has grown.
  *

@@ -668,7 +668,7 @@ final void reset() {


  /**
-  * Returns the character at position <tt>pos</tt> from the
+  * Returns the character at position <code>pos</code> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
@ -213,7 +213,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</ul>
|
||||
</div>
|
||||
|
||||
<a name="N1000D"></a><a name="intro"></a>
|
||||
<a id="N1000D"></a><a id="intro"></a>
|
||||
<h2 class="boxed">What Is Solr?</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
|
@ -228,10 +228,10 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N1002A"></a><a name="news"></a>
|
||||
<a id="N1002A"></a><a id="news"></a>
|
||||
<h2 class="boxed">News</h2>
|
||||
<div class="section">
|
||||
<a name="N10030"></a><a name="02+October+2007+-+Solr+at+OSSummit+Asia"></a>
|
||||
<a id="N10030"></a><a id="02+October+2007+-+Solr+at+OSSummit+Asia"></a>
|
||||
<h3 class="boxed">02 October 2007 - Solr at OSSummit Asia</h3>
|
||||
<p>
|
||||
<a href="http://www.ossummit.com"><img alt="OSSummit Asia logo" class="float-right" src="http://www.ossummit.com/2007/images/logo.png"></a>
|
||||
|
@ -250,7 +250,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="http://www.ossummit.com/2007/program/talk/67">Lucene Case Studies</a> by Erik Hatcher. A rapid series of examples of many Lucene and Solr using applications.</li>
|
||||
|
||||
</ul>
|
||||
<a name="N10058"></a><a name="03+September+2007+-+Lucene+at+ApacheCon+Atlanta"></a>
|
||||
<a id="N10058"></a><a id="03+September+2007+-+Lucene+at+ApacheCon+Atlanta"></a>
|
||||
<h3 class="boxed">03 September 2007 - Lucene at ApacheCon Atlanta</h3>
|
||||
<p>
|
||||
<a href="http://www.us.apachecon.com"><img alt="ApacheCon US logo" class="float-right" src="http://www.apache.org/ads/ApacheCon/2007-usa-125x125.png"></a>
|
||||
|
@ -270,7 +270,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
<li>November 16, 4:00 pm: <a href="http://us.apachecon.com/us2007/program/talk/2017"> Advanced Indexing Techniques with Apache Lucene</a> by Michael Busch. Information on payloads and advanced indexing techniques.</li>
|
||||
|
||||
</ul>
|
||||
<a name="N10091"></a><a name="06+June+2007%3A+Release+1.2+available"></a>
|
||||
<a id="N10091"></a><a id="06+June+2007%3A+Release+1.2+available"></a>
|
||||
<h3 class="boxed">06 June 2007: Release 1.2 available</h3>
|
||||
<p>
|
||||
This is the first release since Solr graduated from the Incubator,
|
||||
|
@ -280,40 +280,40 @@ document.write("Last Published: " + document.lastModified);
|
|||
and more flexible plugins.
|
||||
</p>
|
||||
<p>See the <a href="http://svn.apache.org/repos/asf/lucene/solr/tags/release-1.2.0/CHANGES.txt">release notes</a> for more details.</p>
|
||||
<a name="N100A2"></a><a name="17+January+2007%3A+Solr+graduates+from+Incubator"></a>
|
||||
<a id="N100A2"></a><a id="17+January+2007%3A+Solr+graduates+from+Incubator"></a>
|
||||
<h3 class="boxed">17 January 2007: Solr graduates from Incubator</h3>
|
||||
<p>
|
||||
Solr has graduated from the Apache Incubator, and is now a sub-project of Lucene.
|
||||
</p>
|
||||
<a name="N100AC"></a><a name="22+December+2006%3A+Release+1.1.0+available"></a>
|
||||
<a id="N100AC"></a><a id="22+December+2006%3A+Release+1.1.0+available"></a>
|
||||
<h3 class="boxed">22 December 2006: Release 1.1.0 available</h3>
|
||||
<p>
|
||||
This is the first release since Solr joined the Incubator, and brings
|
||||
many new features and performance optimizations including highlighting,
|
||||
faceted search, and JSON/Python/Ruby response formats.
|
||||
</p>
|
||||
<a name="N100B6"></a><a name="15+August+2006%3A+Solr+at+ApacheCon+US"></a>
|
||||
<a id="N100B6"></a><a id="15+August+2006%3A+Solr+at+ApacheCon+US"></a>
|
||||
<h3 class="boxed">15 August 2006: Solr at ApacheCon US</h3>
|
||||
<p>Chris Hostetter will be presenting
|
||||
<strong><a href="http://www.apachecon.com/2006/US/html/sessions.html#FR26">"Faceted Searching With Apache Solr"</a></strong>
|
||||
at ApacheCon US 2006, on October 13th at 4:30pm.
|
||||
See the <a href="http://www.us.apachecon.com/">ApacheCon</a> website for more details.
|
||||
</p>
|
||||
<a name="N100C9"></a><a name="21+April+2006%3A+Solr+at+ApacheCon"></a>
|
||||
<a id="N100C9"></a><a id="21+April+2006%3A+Solr+at+ApacheCon"></a>
|
||||
<h3 class="boxed">21 April 2006: Solr at ApacheCon</h3>
|
||||
<p>Yonik Seeley will be presenting
|
||||
<strong>"Apache Solr, a Full-Text Search Server based on Lucene"</strong>
|
||||
at ApacheCon Europe 2006, on June 29th at 5:30pm.
|
||||
See the <a href="http://www.eu.apachecon.com/">ApacheCon</a> website for more details.
|
||||
</p>
|
||||
<a name="N100DA"></a><a name="21+February+2006%3A+nightly+builds"></a>
|
||||
<a id="N100DA"></a><a id="21+February+2006%3A+nightly+builds"></a>
|
||||
<h3 class="boxed">21 February 2006: nightly builds</h3>
|
||||
<p>Solr now has nightly builds. This automatically creates a
|
||||
<a href="http://people.apache.org/builds/lucene/solr/nightly/">downloadable version of Solr every
|
||||
night</a>. All unit tests must pass, or a message is sent to
|
||||
the developers mailing list and no new version is created. This
|
||||
also updates the <a href="api/index.html">javadoc</a>.</p>
|
||||
<a name="N100EC"></a><a name="17+January+2006%3A+Solr+Joins+Apache+Incubator"></a>
|
||||
<a id="N100EC"></a><a id="17+January+2006%3A+Solr+Joins+Apache+Incubator"></a>
|
||||
<h3 class="boxed">17 January 2006: Solr Joins Apache Incubator</h3>
|
||||
<p>Solr, a search server based on Lucene, has been accepted into the Apache Incubator.
|
||||
Solr was originally developed by CNET Networks, and is widely used within CNET
|
||||
|
|
|
@@ -59,7 +59,7 @@ import com.ibm.icu.text.RuleBasedBreakIterator;
  *
  * <p>
  * To add per-script rules, add a "rulefiles" argument, which should contain a
- * comma-separated list of <tt>code:rulefile</tt> pairs in the following format:
+ * comma-separated list of <code>code:rulefile</code> pairs in the following format:
  * <a href="http://unicode.org/iso15924/iso15924-codes.html"
  * >four-letter ISO 15924 script code</a>, followed by a colon, then a resource
  * path. E.g. to specify rules for Latin (script code "Latn") and Cyrillic
@@ -47,8 +47,8 @@ This module exposes the following functionality:
  <li><a href="#transform">Text Transformation</a>: Transforms Unicode text in
  a context-sensitive fashion: e.g. mapping Traditional to Simplified Chinese</li>
  </ul>
- <hr/>
- <h1><a name="segmentation">Text Segmentation</a></h1>
+ <hr>
+ <h1><a id="segmentation">Text Segmentation</a></h1>
  <p>
  Text Segmentation (Tokenization) divides document and query text into index terms
  (typically words). Unicode provides special properties and rules so that this can

@@ -76,8 +76,8 @@ algorithm.
  */
  Tokenizer tokenizer = new ICUTokenizer(reader);
  </pre>
- <hr/>
- <h1><a name="collation">Collation</a></h1>
+ <hr>
+ <h1><a id="collation">Collation</a></h1>
  <p>
  <code>ICUCollationKeyAnalyzer</code>
  converts each token into its binary <code>CollationKey</code> using the

@@ -225,8 +225,8 @@ algorithm.
  you use <code>CollationKeyAnalyzer</code> to generate index terms, do not use
  <code>ICUCollationKeyAnalyzer</code> on the query side, or vice versa.
  </p>
- <hr/>
- <h1><a name="normalization">Normalization</a></h1>
+ <hr>
+ <h1><a id="normalization">Normalization</a></h1>
  <p>
  <code>ICUNormalizer2Filter</code> normalizes term text to a
  <a href="http://unicode.org/reports/tr15/">Unicode Normalization Form</a>, so

@@ -253,8 +253,8 @@ algorithm.
  */
  TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer, normalizer);
  </pre>
- <hr/>
- <h1><a name="casefolding">Case Folding</a></h1>
+ <hr>
+ <h1><a id="casefolding">Case Folding</a></h1>
  <p>
  Default caseless matching, or case-folding is more than just conversion to
  lowercase. For example, it handles cases such as the Greek sigma, so that

@@ -288,8 +288,8 @@ this integration. To perform case-folding, you use normalization with the form
  */
  TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer);
  </pre>
- <hr/>
- <h1><a name="searchfolding">Search Term Folding</a></h1>
+ <hr>
+ <h1><a id="searchfolding">Search Term Folding</a></h1>
  <p>
  Search term folding removes distinctions (such as accent marks) between
  similar characters. It is useful for a fuzzy or loose search.

@@ -316,8 +316,8 @@ many character foldings recursively.
  */
  TokenStream tokenstream = new ICUFoldingFilter(tokenizer);
  </pre>
- <hr/>
- <h1><a name="transform">Text Transformation</a></h1>
+ <hr>
+ <h1><a id="transform">Text Transformation</a></h1>
  <p>
  ICU provides text-transformation functionality via its Transliteration API. This allows
  you to transform text in a variety of ways, taking context into account.

@@ -352,8 +352,8 @@ and
  */
  TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Serbian-Latin/BGN"));
  </pre>
- <hr/>
- <h1><a name="backcompat">Backwards Compatibility</a></h1>
+ <hr>
+ <h1><a id="backcompat">Backwards Compatibility</a></h1>
  <p>
  This module exists to provide up-to-date Unicode functionality that supports
  the most recent version of Unicode (currently 11.0). However, some users who wish
@@ -41,7 +41,7 @@ import org.apache.lucene.util.AttributeSource;
  * </p>
  * <p>
  * The dictionary file must be encoded as UTF-8, with one entry per line,
- * in the form <tt>word[tab]lemma[tab]part-of-speech</tt>
+ * in the form <code>word[tab]lemma[tab]part-of-speech</code>
  * </p>
  */
  public class OpenNLPLemmatizerFilter extends TokenFilter {
@@ -97,8 +97,8 @@ public class Diff {
  }

  /**
-  * Apply the given patch string <tt>diff</tt> to the given string <tt>
-  * dest</tt>.
+  * Apply the given patch string <code>diff</code> to the given string <code>
+  * dest</code>.
   *
   * @param dest Destination string
   * @param diff Patch string
@@ -101,8 +101,8 @@ public class Gener extends Reduce {
  *
  * @param in the Row to test
  * @param remap Description of the Parameter
- * @return <tt>true</tt> if the Row should remain, <tt>false
- * </tt> otherwise
+ * @return <code>true</code> if the Row should remain, <code>false
+ * </code> otherwise
  */
  public boolean eat(Row in, int remap[]) {
  int sum = 0;
@@ -71,7 +71,7 @@ public class Lift extends Reduce {
  /**
   * Constructor for the Lift object.
   *
-  * @param changeSkip when set to <tt>true</tt>, comparison of two Cells takes
+  * @param changeSkip when set to <code>true</code>, comparison of two Cells takes
   * a skip command into account
   */
  public Lift(boolean changeSkip) {
@@ -92,7 +92,7 @@ public class MultiTrie extends Trie {
  /**
   * Constructor for the MultiTrie object
   *
-  * @param forward set to <tt>true</tt> if the elements should be read left to
+  * @param forward set to <code>true</code> if the elements should be read left to
   * right
   */
  public MultiTrie(boolean forward) {

@@ -157,7 +157,7 @@ public class MultiTrie extends Trie {
   * Add an element to this structure consisting of the given key and patch
   * command.
   * <p>
-  * This method will return without executing if the <tt>cmd</tt>
+  * This method will return without executing if the <code>cmd</code>
   * parameter's length is 0.
   *
   * @param key the key
@@ -81,7 +81,7 @@ public class MultiTrie2 extends MultiTrie {
  /**
   * Constructor for the MultiTrie2 object
   *
-  * @param forward set to <tt>true</tt> if the elements should be read left to
+  * @param forward set to <code>true</code> if the elements should be read left to
   * right
   */
  public MultiTrie2(boolean forward) {

@@ -187,7 +187,7 @@ public class MultiTrie2 extends MultiTrie {
   * Add an element to this structure consisting of the given key and patch
   * command.
   * <p>
-  * This method will return without executing if the <tt>cmd</tt>
+  * This method will return without executing if the <code>cmd</code>
   * parameter's length is 0.
   *
   * @param key the key
@@ -117,7 +117,7 @@ public class Optimizer extends Reduce {
  *
  * @param master the master Row
  * @param existing the existing Row
- * @return the resulting Row, or <tt>null</tt> if the operation cannot be
+ * @return the resulting Row, or <code>null</code> if the operation cannot be
  * realized
  */
  public Row merge(Row master, Row existing) {

@@ -151,7 +151,7 @@ public class Optimizer extends Reduce {
  *
  * @param m the master Cell
  * @param e the existing Cell
- * @return the resulting Cell, or <tt>null</tt> if the operation cannot be
+ * @return the resulting Cell, or <code>null</code> if the operation cannot be
  * realized
  */
  public Cell merge(Cell m, Cell e) {
@@ -58,7 +58,7 @@ package org.egothor.stemmer;
  * The Optimizer class is a Trie that will be reduced (have empty rows removed).
  * <p>
  * This is the result of allowing a joining of rows when there is no collision
- * between non-<tt>null</tt> values in the rows. Information loss, resulting in
+ * between non-<code>null</code> values in the rows. Information loss, resulting in
  * the stemmer not being able to recognize words (as in Optimizer), is
  * curtailed, allowing the stemmer to recognize words for which the original
  * trie was built. Use of this class allows the stemmer to be self-teaching.

@@ -74,7 +74,7 @@ public class Optimizer2 extends Optimizer {
  *
  * @param m the master Cell
  * @param e the existing Cell
- * @return the resulting Cell, or <tt>null</tt> if the operation cannot be
+ * @return the resulting Cell, or <code>null</code> if the operation cannot be
  * realized
  */
  @Override
@@ -220,7 +220,7 @@ public class Row {
  * Character.
  *
  * @param way the Character associated with the desired Cell
- * @return the reference, or -1 if the Cell is <tt>null</tt>
+ * @return the reference, or -1 if the Cell is <code>null</code>
  */
  public int getRef(Character way) {
  Cell c = at(way);

@@ -255,7 +255,7 @@ public class Row {
  * Return the number of identical Cells (containing patch commands) in this
  * Row.
  *
- * @param eqSkip when set to <tt>false</tt> the removed patch commands are
+ * @param eqSkip when set to <code>false</code> the removed patch commands are
  * considered
  * @return the number of identical Cells, or -1 if there are (at least) two
  * different cells
@@ -96,7 +96,7 @@ public class Trie {
  /**
   * Constructor for the Trie object.
   *
-  * @param forward set to <tt>true</tt>
+  * @param forward set to <code>true</code>
   */
  public Trie(boolean forward) {
  rows.add(new Row());

@@ -107,7 +107,7 @@ public class Trie {
  /**
   * Constructor for the Trie object.
   *
-  * @param forward <tt>true</tt> if read left to right, <tt>false</tt> if read
+  * @param forward <code>true</code> if read left to right, <code>false</code> if read
   * right to left
   * @param root index of the row that is the root node
   * @param cmds the patch commands to store
@@ -133,12 +133,11 @@ all possible cases, so there is always some loss of precision/recall
  (which
  means that even the words from the training corpus are sometimes
  incorrectly stemmed).<br>
- <h2>Algorithm and implementation<span style="font-style: italic;"></span></h2>
+ <h2>Algorithm and implementation</h2>
  The algorithm and its Java implementation is described in detail in the
  publications cited below. Here's just a short excerpt from [2]:<br>
  <br>
- <center>
- <div style="width: 80%;" align="justify">"The aim is separation of the
+ <div style="width: 80%; text-align: center">"The aim is separation of the
  stemmer execution code from the data
  structures [...]. In other words, a static algorithm configurable by
  data must be developed. The word transformations that happen in the

@@ -171,7 +170,6 @@ The P-commands are applied from the end of a word (right to left). This
  assumption can reduce the set of P-command's, because the last NOOP,
  moving the cursor to the end of a string without any changes, need not
  be stored."</div>
- </center>
  <br>
  Data structure used to keep the dictionary (words and their P-commands)
  is a trie. Several optimization steps are applied in turn to reduce and

@@ -273,10 +271,9 @@ incorrect lemma. Note: quite often in such case the output was a
  correct stem.</li>
  <li><b>table size:</b> the size in bytes of the stemmer table.</li>
  </ul>
- <div align="center">
- <table border="1" cellpadding="2" cellspacing="0">
+ <table class="padding2" style="border: 1px solid; border-spacing: 0px; border-collapse: separate">
  <tbody>
- <tr bgcolor="#a0b0c0">
+ <tr style="background-color: #a0b0c0">
  <th>Training sets</th>
  <th>Testing forms</th>
  <th>Stem OK</th>

@@ -286,7 +283,7 @@ correct stem.</li>
  <th>Lemma Bad</th>
  <th>Table size [B]</th>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>100</td>
  <td>1022985</td>
  <td>842209</td>

@@ -296,7 +293,7 @@ correct stem.</li>
  <td>256642</td>
  <td>28438</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>200</td>
  <td>1022985</td>
  <td>862789</td>

@@ -306,7 +303,7 @@ correct stem.</li>
  <td>223209</td>
  <td>48660</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>500</td>
  <td>1022985</td>
  <td>885786</td>

@@ -316,7 +313,7 @@ correct stem.</li>
  <td>207204</td>
  <td>108798</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>700</td>
  <td>1022985</td>
  <td>909031</td>

@@ -326,7 +323,7 @@ correct stem.</li>
  <td>211292</td>
  <td>139291</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>1000</td>
  <td>1022985</td>
  <td>926079</td>

@@ -336,7 +333,7 @@ correct stem.</li>
  <td>207148</td>
  <td>183677</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>2000</td>
  <td>1022985</td>
  <td>942886</td>

@@ -346,7 +343,7 @@ correct stem.</li>
  <td>202915</td>
  <td>313516</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>5000</td>
  <td>1022985</td>
  <td>954721</td>

@@ -356,7 +353,7 @@ correct stem.</li>
  <td>201579</td>
  <td>640969</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>7000</td>
  <td>1022985</td>
  <td>956165</td>

@@ -366,7 +363,7 @@ correct stem.</li>
  <td>198588</td>
  <td>839347</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>10000</td>
  <td>1022985</td>
  <td>965427</td>

@@ -376,7 +373,7 @@ correct stem.</li>
  <td>196681</td>
  <td>1144537</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>12000</td>
  <td>1022985</td>
  <td>967664</td>

@@ -386,7 +383,7 @@ correct stem.</li>
  <td>192120</td>
  <td>1313508</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>15000</td>
  <td>1022985</td>
  <td>973188</td>

@@ -396,7 +393,7 @@ correct stem.</li>
  <td>190871</td>
  <td>1567902</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>17000</td>
  <td>1022985</td>
  <td>974203</td>

@@ -406,7 +403,7 @@ correct stem.</li>
  <td>188862</td>
  <td>1733957</td>
  </tr>
- <tr align="right">
+ <tr style="text-align: right">
  <td>20000</td>
  <td>1022985</td>
  <td>976234</td>

@@ -418,7 +415,6 @@ correct stem.</li>
  </tr>
  </tbody>
  </table>
- </div>
  <p>I also measured the time to produce a stem (which involves
  traversing a trie,
  retrieving a patch command and applying the patch command to the input

@@ -462,7 +458,7 @@ press.</li>
  Intelligent Information Processing and Web Mining Conference, 2004,
  Zakopane, Poland.</li>
  <li>Galambos, L.: Lemmatizer for Document Information Retrieval
- Systems in JAVA.<span style="text-decoration: underline;"> </span><a
+ Systems in JAVA.<a
  class="moz-txt-link-rfc2396E"
  href="http://www.informatik.uni-trier.de/%7Eley/db/conf/sofsem/sofsem2001.html#Galambos01"><http://www.informatik.uni-trier.de/%7Eley/db/conf/sofsem/sofsem2001.html#Galambos01></a>
  SOFSEM 2001, Piestany, Slovakia. <br>
@@ -95,14 +95,14 @@ import org.apache.lucene.util.packed.PackedInts;
  * <p>
  * Files and detailed format:
  * <ul>
- * <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
- * <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
- * <li><tt>.doc</tt>: <a href="#Frequencies">Frequencies and Skip Data</a></li>
- * <li><tt>.pos</tt>: <a href="#Positions">Positions</a></li>
- * <li><tt>.pay</tt>: <a href="#Payloads">Payloads and Offsets</a></li>
+ * <li><code>.tim</code>: <a href="#Termdictionary">Term Dictionary</a></li>
+ * <li><code>.tip</code>: <a href="#Termindex">Term Index</a></li>
+ * <li><code>.doc</code>: <a href="#Frequencies">Frequencies and Skip Data</a></li>
+ * <li><code>.pos</code>: <a href="#Positions">Positions</a></li>
+ * <li><code>.pay</code>: <a href="#Payloads">Payloads and Offsets</a></li>
  * </ul>
  *
- * <a name="Termdictionary"></a>
+ * <a id="Termdictionary"></a>
  * <dl>
  * <dd>
  * <b>Term Dictionary</b>

@@ -162,7 +162,7 @@ import org.apache.lucene.util.packed.PackedInts;
  * </dd>
  * </dl>
  *
- * <a name="Termindex"></a>
+ * <a id="Termindex"></a>
  * <dl>
  * <dd>
  * <b>Term Index</b>

@@ -172,7 +172,7 @@ import org.apache.lucene.util.packed.PackedInts;
  * </dl>
  *
  *
- * <a name="Frequencies"></a>
+ * <a id="Frequencies"></a>
  * <dl>
  * <dd>
  * <b>Frequencies and Skip Data</b>

@@ -260,7 +260,7 @@ import org.apache.lucene.util.packed.PackedInts;
  * </dd>
  * </dl>
  *
- * <a name="Positions"></a>
+ * <a id="Positions"></a>
  * <dl>
  * <dd>
  * <b>Positions</b>

@@ -313,7 +313,7 @@ import org.apache.lucene.util.packed.PackedInts;
  * </dd>
  * </dl>
  *
- * <a name="Payloads"></a>
+ * <a id="Payloads"></a>
  * <dl>
  * <dd>
  * <b>Payloads and Offsets</b>
@@ -25,7 +25,8 @@
  * Contained packages:
  * </p>
  *
- * <table border=1 cellpadding=4 summary="table of benchmark packages">
+ * <table class="padding4" style="border: 1px solid">
+ * <caption>table of benchmark packages</caption>
  * <tr>
  * <td><b>Package</b></td>
  * <td><b>Description</b></td>

@@ -63,7 +64,7 @@
  * report.</a></li>
  * <li><a href="#recsCounting">Results record counting clarified</a></li>
  * </ol>
- * <a name="concept"></a>
+ * <a id="concept"></a>
  * <h2>Benchmarking By Tasks</h2>
  * <p>
  * Benchmark Lucene using task primitives.

@@ -79,7 +80,7 @@
  * additional characteristics of the benchmark run.
  * </p>
  *
- * <a name="usage"></a>
+ * <a id="usage"></a>
  * <h2>How to use</h2>
  * <p>
  * Easiest way to run a benchmarks is using the predefined ant task:

@@ -166,7 +167,7 @@
  * <b>org.apache.lucene.benchmark.byTask.tasks</b> specify that package thru the
  * <span style="color: #FF0000">alt.tasks.packages</span> property.
  *
- * <a name="algorithm"></a>
+ * <a id="algorithm"></a>
  * <h2>Benchmark "algorithm"</h2>
  *
  * <p>

@@ -312,7 +313,7 @@
  * </ol>
  *
  *
- * <a name="tasks"></a>
+ * <a id="tasks"></a>
  * <h2>Supported tasks/commands</h2>
  *
  * <p>

@@ -481,7 +482,7 @@
  * </li>
  * </ol>
  *
- * <a name="properties"></a>
+ * <a id="properties"></a>
  * <h2>Benchmark properties</h2>
  *
  * <p>

@@ -604,7 +605,7 @@
  * For sample use of these properties see the *.alg files under conf.
  * </p>
  *
- * <a name="example"></a>
+ * <a id="example"></a>
  * <h2>Example input algorithm and the result benchmark report</h2>
  * <p>
  * The following example is in conf/sample.alg:

@@ -690,7 +691,7 @@
  * PopulateLong - - 1 20 1000 - - 1 - - 10003 - - - 77.0 - - 129.92 - 87,309,608 - 100,831,232
  * </pre>
  *
- * <a name="recsCounting"></a>
+ * <a id="recsCounting"></a>
  * <h2>Results record counting clarified</h2>
  * <p>
  * Two columns in the results table indicate records counts: records-per-run and
@@ -51,8 +51,8 @@ import java.util.regex.Pattern;
  * <li>Analyzer args:
  * <ul>
  * <li><b>Required</b>: <code>name:<i>analyzer-factory-name</i></code></li>
- * <li>Optional: <tt>positionIncrementGap:<i>int value</i></tt> (default: 0)</li>
- * <li>Optional: <tt>offsetGap:<i>int value</i></tt> (default: 1)</li>
+ * <li>Optional: <code>positionIncrementGap:<i>int value</i></code> (default: 0)</li>
+ * <li>Optional: <code>offsetGap:<i>int value</i></code> (default: 1)</li>
  * </ul>
  * </li>
  * <li>zero or more CharFilterFactory's, followed by</li>

@@ -60,7 +60,7 @@ import java.util.regex.Pattern;
  * <li>zero or more TokenFilterFactory's</li>
  * </ol>
  *
- * Each component analysis factory may specify <tt>luceneMatchVersion</tt> (defaults to
+ * Each component analysis factory may specify <code>luceneMatchVersion</code> (defaults to
  * {@link Version#LATEST}) and any of the args understood by the specified
  * *Factory class, in the above-describe param format.
  * <p>
@@ -144,13 +144,6 @@

  <!-- we check for broken links across all documentation -->
  <target name="-documentation-lint" depends="documentation">
-   <echo message="checking for broken html..."/>
-   <jtidy-macro>
-     <!-- NOTE: must currently exclude deprecated-list due to a javadocs bug (as of 1.7.0_09)
-          javadocs generates invalid XML if you deprecate a method that takes a parameter
-          with a generic type -->
-     <fileset dir="build/docs" includes="**/*.html" excludes="**/deprecated-list.html"/>
-   </jtidy-macro>
    <echo message="Checking for broken links..."/>
    <check-broken-links dir="build/docs"/>
    <echo message="Checking for missing docs..."/>
@@ -58,11 +58,11 @@ import org.apache.lucene.util.fst.Util;
  * <p>
  * File:
  * <ul>
- * <li><tt>.tst</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
+ * <li><code>.tst</code>: <a href="#Termdictionary">Term Dictionary</a></li>
  * </ul>
  * <p>
  *
- * <a name="Termdictionary"></a>
+ * <a id="Termdictionary"></a>
  * <h3>Term Dictionary</h3>
  * <p>
  * The .tst contains a list of FSTs, one for each field.
@@ -202,7 +202,7 @@
  <property name="javadoc.noindex" value="true"/>

  <!---TODO: Fix accessibility (order of H1/H2/H3 headings), see https://issues.apache.org/jira/browse/LUCENE-8729 -->
- <property name="javadoc.doclint.args" value="-Xdoclint:all,-missing,-accessibility,-html"/>
+ <property name="javadoc.doclint.args" value="-Xdoclint:all,-missing,-accessibility"/>
  <!---proc:none was added because of LOG4J2-1925 / JDK-8186647 -->
  <property name="javac.doclint.args" value="-Xdoclint:all/protected -Xdoclint:-missing -Xdoclint:-accessibility -proc:none"/>

@@ -2089,30 +2089,6 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
    </sequential>
  </macrodef>

- <!-- TODO: if we make a custom ant task, we can give better
-      errors and stuff here, and not make a stupid temp dir -->
- <macrodef name="jtidy-macro">
-   <element name="nested" implicit="yes" optional="yes"/>
-   <sequential>
-     <!--
-       TODO: find a better replacement for jTIDY that can handle HTML5
-       <ivy:cachepath organisation="net.sf.jtidy" module="jtidy" revision="r938"
-         log="download-only" inline="true" conf="master" type="jar" pathid="jtidy.classpath" />
-       <taskdef name="tidy" classname="org.w3c.tidy.ant.JTidyTask" classpathref="jtidy.classpath"/>
-       <delete dir="${common.dir}/build/jtidy_tmp" quiet="true"/>
-       <echo message="Checking for broken html (such as invalid tags)..." taskname="jtidy"/>
-       <tidy failonerror="true" destdir="${common.dir}/build/jtidy_tmp">
-         <nested/>
-         <parameter name="input-encoding" value="UTF-8" />
-         <parameter name="only-errors" value="true" />
-         <parameter name="show-warnings" value="false" />
-       </tidy>
-       <delete dir="${common.dir}/build/jtidy_tmp" quiet="true"/>
-     -->
-     <echo message="FIXME: Broken HTML checks were disabled, as jtidy can't handle HTML5." taskname="jtidy"/>
-   </sequential>
- </macrodef>
  <property name="failonjavadocwarning" value="true"/>
  <macrodef name="invoke-javadoc">
    <element name="sources" optional="yes"/>
@@ -2166,6 +2142,10 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
    </javadoc>
    <record name="@{destdir}/log_javadoc.txt" action="stop"/>

+   <!-- append some special table css -->
+   <concat destfile="@{destdir}/stylesheet.css" append="true" fixlastline="true" encoding="UTF-8">
+     <filelist dir="${common.dir}/tools/javadoc" files="table_padding.css"/>
+   </concat>
    <!-- append prettify to scripts and css -->
    <concat destfile="@{destdir}/stylesheet.css" append="true" fixlastline="true" encoding="UTF-8">
      <filelist dir="${prettify.dir}" files="prettify.css"/>
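The padding2/padding3/padding4 classes used in the converted tables are defined in the table_padding.css file that the hunk above appends to the generated stylesheet. The rules themselves are not part of this diff; a minimal sketch of what such helper classes would contain (an assumption for illustration, since the old cellpadding must be emulated by padding the inner th/td cells rather than set inline on the table):

    /* hypothetical sketch of table_padding.css, not the committed file */
    table.padding2 th, table.padding2 td { padding: 2px; }
    table.padding3 th, table.padding3 td { padding: 3px; }
    table.padding4 th, table.padding4 td { padding: 4px; }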
@@ -163,7 +163,7 @@
  *
  * All internal variables are reset, the old input stream
  * <b>cannot</b> be reused (internal buffer is discarded and lost).
- * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ * Lexical state is set to <code>ZZ_INITIAL</code>.
  *
  * Internal scan buffer is resized down to its initial length, if it has grown.
  *

@@ -211,7 +211,7 @@


  /**
-  * Returns the character at position <tt>pos</tt> from the
+  * Returns the character at position <code>pos</code> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
@@ -169,7 +169,7 @@
  *
  * All internal variables are reset, the old input stream
  * <b>cannot</b> be reused (internal buffer is discarded and lost).
- * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ * Lexical state is set to <code>ZZ_INITIAL</code>.
  *
  * Internal scan buffer is resized down to its initial length, if it has grown.
  *

@@ -217,7 +217,7 @@


  /**
-  * Returns the character at position <tt>pos</tt> from the
+  * Returns the character at position <code>pos</code> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
@@ -383,7 +383,8 @@
  * synonyms, setting the position increment to 0 is enough to denote the fact that two
  * words are synonyms, for example:
  * </p>
- * <table summary="table showing position increments of 1 and 0 for red and magenta, respectively">
+ * <table>
+ * <caption>table showing position increments of 1 and 0 for red and magenta, respectively</caption>
  * <tr><td>Term</td><td>red</td><td>magenta</td></tr>
  * <tr><td>Position increment</td><td>1</td><td>0</td></tr>
  * </table>

@@ -394,7 +395,8 @@
  * a TokenStream where "IBM" is a synonym of "Internal Business Machines". Position increments
  * are not enough anymore:
  * </p>
- * <table summary="position increments where international is zero">
+ * <table>
+ * <caption>position increments where international is zero</caption>
  * <tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
  * <tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
  * </table>

@@ -405,7 +407,8 @@
  * than "International" is a synonym of "Business". The only way to solve this issue is to
  * make "IBM" span across 3 positions, this is where position lengths come to rescue.
  * </p>
- * <table summary="position lengths where IBM is three">
+ * <table>
+ * <caption>position lengths where IBM is three</caption>
  * <tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
  * <tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
  * <tr><td>Position length</td><td>3</td><td>1</td><td>1</td><td>1</td></tr>

@@ -414,7 +417,7 @@
  * This new attribute makes clear that "IBM" and "International Business Machines" start and end
  * at the same positions.
  * </p>
- * <a name="corrupt"></a>
+ * <a id="corrupt"></a>
  * <h3>How to not write corrupt token streams</h3>
  * <p>
  * There are a few rules to observe when writing custom Tokenizers and TokenFilters:

@@ -472,7 +475,9 @@
  * <p>
  * Lucene provides seven Attributes out of the box:
  * </p>
- * <table rules="all" frame="box" cellpadding="3" summary="common bundled attributes">
+ * <table class="padding3">
+ * <caption>common bundled attributes</caption>
+ * <tbody style="border: 1px solid">
  * <tr>
  * <td>{@link org.apache.lucene.analysis.tokenattributes.CharTermAttribute}</td>
  * <td>

@@ -513,6 +518,7 @@
  * return true from this attribute's isKeyword() method.
  * </td>
  * </tr>
+ * </tbody>
  * </table>
  * <h3>More Requirements for Analysis Component Classes</h3>
  * Due to the historical development of the API, there are some perhaps
@@ -726,7 +726,7 @@ public final class StandardTokenizerImpl {
  *
  * All internal variables are reset, the old input stream
  * <b>cannot</b> be reused (internal buffer is discarded and lost).
- * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ * Lexical state is set to <code>ZZ_INITIAL</code>.
  *
  * Internal scan buffer is resized down to its initial length, if it has grown.
  *

@@ -774,7 +774,7 @@ public final class StandardTokenizerImpl {


  /**
-  * Returns the character at position <tt>pos</tt> from the
+  * Returns the character at position <code>pos</code> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
@@ -95,11 +95,11 @@ import org.apache.lucene.util.fst.Util;
  *
  * Files:
  * <ul>
- * <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
- * <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
+ * <li><code>.tim</code>: <a href="#Termdictionary">Term Dictionary</a></li>
+ * <li><code>.tip</code>: <a href="#Termindex">Term Index</a></li>
  * </ul>
  * <p>
- * <a name="Termdictionary"></a>
+ * <a id="Termdictionary"></a>
  * <h3>Term Dictionary</h3>
  *
  * <p>The .tim file contains the list of terms in each

@@ -158,7 +158,7 @@ import org.apache.lucene.util.fst.Util;
  * <li>For inner nodes of the tree, every entry will steal one bit to mark whether it points
  * to child nodes(sub-block). If so, the corresponding TermStats and TermMetaData are omitted </li>
  * </ul>
- * <a name="Termindex"></a>
+ * <a id="Termindex"></a>
  * <h3>Term Index</h3>
  * <p>The .tip file contains an index into the term dictionary, so that it can be
  * accessed randomly. The index is also used to determine
@@ -35,8 +35,8 @@ import org.apache.lucene.util.packed.DirectMonotonicWriter;
  * A {@link StoredFieldsFormat} that compresses documents in chunks in
  * order to improve the compression ratio.
  * <p>
- * For a chunk size of <tt>chunkSize</tt> bytes, this {@link StoredFieldsFormat}
- * does not support documents larger than (<tt>2<sup>31</sup> - chunkSize</tt>)
+ * For a chunk size of <var>chunkSize</var> bytes, this {@link StoredFieldsFormat}
+ * does not support documents larger than (<code>2<sup>31</sup> - chunkSize</code>)
  * bytes.
  * <p>
  * For optimal performance, you should use a {@link MergePolicy} that returns
@@ -34,9 +34,9 @@ import org.apache.lucene.store.IndexOutput;
  * <p>
  * Files:
  * <ul>
- * <li><tt>.cfs</tt>: An optional "virtual" file consisting of all the other
+ * <li><code>.cfs</code>: An optional "virtual" file consisting of all the other
  *   index files for systems that frequently run out of file handles.
- * <li><tt>.cfe</tt>: The "virtual" compound file's entry table holding all
+ * <li><code>.cfe</code>: The "virtual" compound file's entry table holding all
  *   entries in the corresponding .cfs file.
  * </ul>
  * <p>Description:</p>
@@ -40,7 +40,7 @@ import org.apache.lucene.store.IndexOutput;

  /**
  * Lucene 5.0 Field Infos format.
- * <p>Field names are stored in the field info file, with suffix <tt>.fnm</tt>.
+ * <p>Field names are stored in the field info file, with suffix <code>.fnm</code>.
  * <p>FieldInfos (.fnm) --> Header,FieldsCount, <FieldName,FieldNumber,
  * FieldBits,DocValuesBits,DocValuesGen,Attributes> <sup>FieldsCount</sup>,Footer
  * <p>Data types:
@@ -57,10 +57,10 @@ import org.apache.lucene.util.packed.DirectMonotonicWriter;
  * <p><b>File formats</b>
  * <p>Stored fields are represented by three files:
  * <ol>
- * <li><a name="field_data"></a>
- * <p>A fields data file (extension <tt>.fdt</tt>). This file stores a compact
+ * <li><a id="field_data"></a>
+ * <p>A fields data file (extension <code>.fdt</code>). This file stores a compact
  * representation of documents in compressed blocks of 16KB or more. When
- * writing a segment, documents are appended to an in-memory <tt>byte[]</tt>
+ * writing a segment, documents are appended to an in-memory <code>byte[]</code>
  * buffer. When its size reaches 16KB or more, some metadata about the documents
  * is flushed to disk, immediately followed by a compressed representation of
  * the buffer using the

@@ -83,21 +83,21 @@ import org.apache.lucene.util.packed.DirectMonotonicWriter;
  * is less than 0.5%.</li>
  * </ul>
  * </li>
- * <li><a name="field_index"></a>
- * <p>A fields index file (extension <tt>.fdx</tt>). This file stores two
+ * <li><a id="field_index"></a>
+ * <p>A fields index file (extension <code>.fdx</code>). This file stores two
  * {@link DirectMonotonicWriter monotonic arrays}, one for the first doc IDs of
  * each block of compressed documents, and another one for the corresponding
  * offsets on disk. At search time, the array containing doc IDs is
  * binary-searched in order to find the block that contains the expected doc ID,
  * and the associated offset on disk is retrieved from the second array.</p>
- * <li><a name="field_meta"></a>
- * <p>A fields meta file (extension <tt>.fdm</tt>). This file stores metadata
+ * <li><a id="field_meta"></a>
+ * <p>A fields meta file (extension <code>.fdm</code>). This file stores metadata
  * about the monotonic arrays stored in the index file.</p>
  * </li>
  * </ol>
  * <p><b>Known limitations</b>
  * <p>This {@link StoredFieldsFormat} does not support individual documents
- * larger than (<tt>2<sup>31</sup> - 2<sup>14</sup></tt>) bytes.
+ * larger than (<code>2<sup>31</sup> - 2<sup>14</sup></code>) bytes.
  * @lucene.experimental
  */
  public final class Lucene50StoredFieldsFormat extends StoredFieldsFormat {
@@ -48,8 +48,8 @@ import org.apache.lucene.util.packed.PackedInts;
  * Looking up term vectors for any document requires at most 1 disk seek.
  * <p><b>File formats</b>
  * <ol>
- * <li><a name="vector_data"></a>
- * <p>A vector data file (extension <tt>.tvd</tt>). This file stores terms,
+ * <li><a id="vector_data"></a>
+ * <p>A vector data file (extension <code>.tvd</code>). This file stores terms,
  * frequencies, positions, offsets and payloads for every document. Upon writing
  * a new segment, it accumulates data into memory until the buffer used to store
  * terms and payloads grows beyond 4KB. Then it flushes all metadata, terms

@@ -111,8 +111,8 @@ import org.apache.lucene.util.packed.PackedInts;
  * <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
  * </ul>
  * </li>
- * <li><a name="vector_index"></a>
- * <p>An index file (extension <tt>.tvx</tt>).
+ * <li><a id="vector_index"></a>
+ * <p>An index file (extension <code>.tvx</code>).
  * <ul>
  * <li>VectorIndex (.tvx) --> <Header>, <ChunkIndex>, Footer</li>
  * <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
@@ -40,7 +40,7 @@ import org.apache.lucene.store.IndexOutput;

  /**
  * Lucene 6.0 Field Infos format.
- * <p>Field names are stored in the field info file, with suffix <tt>.fnm</tt>.
+ * <p>Field names are stored in the field info file, with suffix <code>.fnm</code>.
  * <p>FieldInfos (.fnm) --> Header,FieldsCount, <FieldName,FieldNumber,
  * FieldBits,DocValuesBits,DocValuesGen,Attributes,DimensionCount,DimensionNumBytes> <sup>FieldsCount</sup>,Footer
  * <p>Data types:
@@ -45,7 +45,7 @@ import org.apache.lucene.util.Version;
  * <p>
  * Files:
  * <ul>
- * <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Attributes, IndexSort, Footer
+ * <li><code>.si</code>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Attributes, IndexSort, Footer
  * </ul>
  * Data types:
  * <ul>
@@ -35,8 +35,8 @@ import org.apache.lucene.util.packed.DirectWriter;
  * <p>
  * Documents that have a value for the field are encoded in a way that it is always possible to
  * know the ordinal of the current document in the set of documents that have a value. For instance,
- * say the set of documents that have a value for the field is <tt>{1, 5, 6, 11}</tt>. When the
- * iterator is on <tt>6</tt>, it knows that this is the 3rd item of the set. This way, values can
+ * say the set of documents that have a value for the field is <code>{1, 5, 6, 11}</code>. When the
+ * iterator is on <code>6</code>, it knows that this is the 3rd item of the set. This way, values can
  * be stored densely and accessed based on their index at search time. If all documents in a segment
  * have a value for the field, the index is the same as the doc ID, so this case is encoded implicitly
  * and is very fast at query time. On the other hand if some documents are missing a value for the

@@ -124,8 +124,8 @@ import org.apache.lucene.util.packed.DirectWriter;
  * <p>
  * Files:
  * <ol>
- * <li><tt>.dvd</tt>: DocValues data</li>
- * <li><tt>.dvm</tt>: DocValues metadata</li>
+ * <li><code>.dvd</code>: DocValues data</li>
+ * <li><code>.dvm</code>: DocValues metadata</li>
  * </ol>
  * @lucene.experimental
  */
@@ -34,11 +34,11 @@ import org.apache.lucene.store.DataOutput;
  * <p>
  * Files:
  * <ol>
- * <li><tt>.nvd</tt>: Norms data</li>
- * <li><tt>.nvm</tt>: Norms metadata</li>
+ * <li><code>.nvd</code>: Norms data</li>
+ * <li><code>.nvm</code>: Norms metadata</li>
  * </ol>
  * <ol>
- * <li><a name="nvm"></a>
+ * <li><a id="nvm"></a>
  * <p>The Norms metadata or .nvm file.</p>
  * <p>For each norms field, this stores metadata, such as the offset into the
  * Norms data (.nvd)</p>

@@ -62,7 +62,7 @@ import org.apache.lucene.store.DataOutput;
  * in the norms data (.nvd), or -2 if no documents have a norm value, or -1 if all documents have a norm
  * value.</p>
  * <p>DocsWithFieldLength is the number of bytes used to encode the set of documents that have a norm.</p>
- * <li><a name="nvd"></a>
+ * <li><a id="nvd"></a>
  * <p>The Norms data or .nvd file.</p>
  * <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
  * <p>Norms data (.nvd) --> Header,< Data ><sup>NumFields</sup>,Footer</p>
@ -96,14 +96,14 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <p>
|
||||
* Files and detailed format:
|
||||
* <ul>
|
||||
* <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
|
||||
* <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
|
||||
* <li><tt>.doc</tt>: <a href="#Frequencies">Frequencies and Skip Data</a></li>
|
||||
* <li><tt>.pos</tt>: <a href="#Positions">Positions</a></li>
|
||||
* <li><tt>.pay</tt>: <a href="#Payloads">Payloads and Offsets</a></li>
|
||||
* <li><code>.tim</code>: <a href="#Termdictionary">Term Dictionary</a></li>
|
||||
* <li><code>.tip</code>: <a href="#Termindex">Term Index</a></li>
|
||||
* <li><code>.doc</code>: <a href="#Frequencies">Frequencies and Skip Data</a></li>
|
||||
* <li><code>.pos</code>: <a href="#Positions">Positions</a></li>
|
||||
* <li><code>.pay</code>: <a href="#Payloads">Payloads and Offsets</a></li>
|
||||
* </ul>
|
||||
*
|
||||
* <a name="Termdictionary"></a>
|
||||
* <a id="Termdictionary"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Term Dictionary</b>
|
||||
|
@ -163,7 +163,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* </dd>
|
||||
* </dl>
|
||||
*
|
||||
* <a name="Termindex"></a>
|
||||
* <a id="Termindex"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Term Index</b>
|
||||
|
@ -173,7 +173,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* </dl>
|
||||
*
|
||||
*
|
||||
* <a name="Frequencies"></a>
|
||||
* <a id="Frequencies"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Frequencies and Skip Data</b>
|
||||
|
@ -261,7 +261,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* </dd>
|
||||
* </dl>
|
||||
*
|
||||
* <a name="Positions"></a>
|
||||
* <a id="Positions"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Positions</b>
|
||||
|
@ -314,7 +314,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* </dd>
|
||||
* </dl>
|
||||
*
|
||||
* <a name="Payloads"></a>
|
||||
* <a id="Payloads"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Payloads and Offsets</b>
|
||||
|
|
|
@ -41,7 +41,7 @@
|
|||
* </li>
|
||||
* </ul>
|
||||
* </div>
|
||||
* <a name="Introduction"></a>
|
||||
* <a id="Introduction"></a>
|
||||
* <h2>Introduction</h2>
|
||||
* <div>
|
||||
* <p>This document defines the index file formats used in this version of Lucene.
|
||||
|
@ -51,7 +51,7 @@
|
|||
* <p>This document attempts to provide a high-level definition of the Apache
|
||||
* Lucene file formats.</p>
|
||||
* </div>
|
||||
* <a name="Definitions"></a>
|
||||
* <a id="Definitions"></a>
|
||||
* <h2>Definitions</h2>
|
||||
* <div>
|
||||
* <p>The fundamental concepts in Lucene are index, document, field and term.</p>
|
||||
|
@ -64,14 +64,14 @@
|
|||
* <p>The same sequence of bytes in two different fields is considered a different
|
||||
* term. Thus terms are represented as a pair: the string naming the field, and the
|
||||
* bytes within the field.</p>
|
||||
* <a name="Inverted_Indexing"></a>
|
||||
* <a id="Inverted_Indexing"></a>
|
||||
* <h3>Inverted Indexing</h3>
|
||||
* <p>The index stores statistics about terms in order to make term-based search
|
||||
* more efficient. Lucene's index falls into the family of indexes known as an
|
||||
* <i>inverted index.</i> This is because it can list, for a term, the documents
|
||||
* that contain it. This is the inverse of the natural relationship, in which
|
||||
* documents list terms.</p>
|
||||
* <a name="Types_of_Fields"></a>
|
||||
* <a id="Types_of_Fields"></a>
|
||||
* <h3>Types of Fields</h3>
|
||||
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
|
||||
* in the index literally, in a non-inverted manner. Fields that are inverted are
|
||||
|
@ -82,7 +82,7 @@
|
|||
* indexed literally.</p>
|
||||
* <p>See the {@link org.apache.lucene.document.Field Field}
|
||||
* java docs for more information on Fields.</p>
|
||||
* <a name="Segments"></a>
|
||||
* <a id="Segments"></a>
|
||||
* <h3>Segments</h3>
|
||||
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
|
||||
* Each segment is a fully independent index, which could be searched separately.
|
||||
|
@ -93,7 +93,7 @@
|
|||
* </ol>
|
||||
* <p>Searches may involve multiple segments and/or multiple indexes, each index
|
||||
* potentially composed of a set of segments.</p>
|
||||
* <a name="Document_Numbers"></a>
|
||||
* <a id="Document_Numbers"></a>
|
||||
* <h3>Document Numbers</h3>
|
||||
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>.
|
||||
* The first document added to an index is numbered zero, and each subsequent
|
||||
|
@ -122,7 +122,7 @@
|
|||
* </li>
|
||||
* </ul>
|
||||
* </div>
|
||||
* <a name="Overview"></a>
|
||||
* <a id="Overview"></a>
|
||||
* <h2>Index Structure Overview</h2>
|
||||
* <div>
|
||||
* <p>Each segment index maintains the following:</p>
|
||||
|
@ -194,7 +194,7 @@
|
|||
* </ul>
|
||||
* <p>Details on each of these are provided in their linked pages.</p>
|
||||
* </div>
|
||||
* <a name="File_Naming"></a>
|
||||
* <a id="File_Naming"></a>
|
||||
* <h2>File Naming</h2>
|
||||
* <div>
|
||||
* <p>All files belonging to a segment have the same name with varying extensions.
|
||||
|
@ -210,12 +210,13 @@
|
|||
* segments_1, then segments_2, etc. The generation is a sequential long integer
|
||||
* represented in alpha-numeric (base 36) form.</p>
|
||||
* </div>
|
||||
* <a name="file-names"></a>
|
||||
* <a id="file-names"></a>
|
||||
* <h2>Summary of File Extensions</h2>
|
||||
* <div>
|
||||
* <p>The following table summarizes the names and extensions of the files in
|
||||
* Lucene:</p>
|
||||
* <table cellspacing="1" cellpadding="4" summary="lucene filenames by extension">
|
||||
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
|
||||
* <caption>lucene filenames by extension</caption>
|
||||
* <tr>
|
||||
* <th>Name</th>
|
||||
* <th>Extension</th>
|
||||
|
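The base-36 generation naming described above can be reproduced with Long.toString and radix 36; a sketch (the helper name is invented, Lucene's own utility lives elsewhere):

```java
public class GenerationNames {
  // Hypothetical helper: formats a generation the way the docs describe.
  static String segmentsFileName(long generation) {
    // Character.MAX_RADIX == 36, so this is the alpha-numeric (base 36) form
    return "segments_" + Long.toString(generation, Character.MAX_RADIX);
  }

  public static void main(String[] args) {
    System.out.println(segmentsFileName(1));   // segments_1
    System.out.println(segmentsFileName(100)); // segments_2s
  }
}
```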
@@ -315,7 +316,7 @@
  * </tr>
  * </table>
  * </div>
- * <a name="Lock_File"></a>
+ * <a id="Lock_File"></a>
  * <h2>Lock File</h2>
  * The write lock, which is stored in the index directory by default, is named
  * "write.lock". If the lock directory is different from the index directory then
@@ -323,7 +324,7 @@
  * derived from the full path to the index directory. When this file is present, a
  * writer is currently modifying the index (adding or removing documents). This
  * lock file ensures that only one writer is modifying the index at a time.
- * <a name="History"></a>
+ * <a id="History"></a>
  * <h2>History</h2>
  * <p>Compatibility notes are provided in this document, describing how file
  * formats have changed from prior versions:</p>
@@ -399,7 +400,7 @@
  * <li>In version 8.4, postings, positions, offsets and payload lengths have move to a more
  * performant encoding that is vectorized.</li>
  * </ul>
- * <a name="Limitations"></a>
+ * <a id="Limitations"></a>
  * <h2>Limitations</h2>
  * <div>
  * <p>Lucene uses a Java <code>int</code> to refer to

@@ -54,8 +54,8 @@ import org.apache.lucene.util.IOUtils;
  * {@link ServiceLoader Service Provider Interface} to resolve format names.
  * <p>
  * Files written by each docvalues format have an additional suffix containing the
- * format name. For example, in a per-field configuration instead of <tt>_1.dat</tt>
- * filenames would look like <tt>_1_Lucene40_0.dat</tt>.
+ * format name. For example, in a per-field configuration instead of <code>_1.dat</code>
+ * filenames would look like <code>_1_Lucene40_0.dat</code>.
  * @see ServiceLoader
  * @lucene.experimental
  */

@@ -61,8 +61,8 @@ import org.apache.lucene.util.RamUsageEstimator;
  * {@link ServiceLoader Service Provider Interface} to resolve format names.
  * <p>
  * Files written by each posting format have an additional suffix containing the
- * format name. For example, in a per-field configuration instead of <tt>_1.prx</tt>
- * filenames would look like <tt>_1_Lucene40_0.prx</tt>.
+ * format name. For example, in a per-field configuration instead of <code>_1.prx</code>
+ * filenames would look like <code>_1_Lucene40_0.prx</code>.
  * @see ServiceLoader
  * @lucene.experimental
  */

@@ -37,7 +37,7 @@ import java.util.concurrent.atomic.AtomicInteger;
  * as documents are added to and deleted from an index. Clients should thus not
  * rely on a given document having the same number between sessions.
  *
- * <p><a name="thread-safety"></a><p><b>NOTE</b>: {@link
+ * <p><a id="thread-safety"></a><p><b>NOTE</b>: {@link
  * IndexReader} instances are completely thread
  * safe, meaning multiple threads can call any of its methods,
  * concurrently. If your application requires external

@@ -42,7 +42,7 @@ import org.apache.lucene.store.*;
  rely on a given document having the same number between sessions.

  <p>
- <a name="thread-safety"></a><p><b>NOTE</b>: {@link
+ <a id="thread-safety"></a><p><b>NOTE</b>: {@link
  IndexReader} instances are completely thread
  safe, meaning multiple threads can call any of its methods,
  concurrently. If your application requires external

@@ -42,7 +42,7 @@ import org.apache.lucene.store.Directory;
  rely on a given document having the same number between sessions.

  <p>
- <a name="thread-safety"></a><p><b>NOTE</b>: {@link
+ <a id="thread-safety"></a><p><b>NOTE</b>: {@link
  IndexReader} instances are completely thread
  safe, meaning multiple threads can call any of its methods,
  concurrently. If your application requires external

@@ -68,7 +68,7 @@ import org.apache.lucene.util.Bits; // javadocs
  rely on a given document having the same number between sessions.

  <p>
- <a name="thread-safety"></a><p><b>NOTE</b>: {@link
+ <a id="thread-safety"></a><p><b>NOTE</b>: {@link
  IndexReader} instances are completely thread
  safe, meaning multiple threads can call any of its methods,
  concurrently. If your application requires external

@@ -28,9 +28,9 @@ public abstract class IndexReaderContext {
  public final CompositeReaderContext parent;
  /** <code>true</code> if this context struct represents the top level reader within the hierarchical context */
  public final boolean isTopLevel;
- /** the doc base for this reader in the parent, <tt>0</tt> if parent is null */
+ /** the doc base for this reader in the parent, <code>0</code> if parent is null */
  public final int docBaseInParent;
- /** the ord for this reader in the parent, <tt>0</tt> if parent is null */
+ /** the ord for this reader in the parent, <code>0</code> if parent is null */
  public final int ordInParent;

  // An object that uniquely identifies this context without referencing

@@ -100,7 +100,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
  and then adds the entire document). When finished adding, deleting
  and updating documents, {@link #close() close} should be called.</p>

- <a name="sequence_numbers"></a>
+ <a id="sequence_numbers"></a>
  <p>Each method that changes the index returns a {@code long} sequence number, which
  expresses the effective order in which each change was applied.
  {@link #commit} also returns a sequence number, describing which
@@ -108,7 +108,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
  are transient (not saved into the index in any way) and only valid
  within a single {@code IndexWriter} instance.</p>

- <a name="flush"></a>
+ <a id="flush"></a>
  <p>These changes are buffered in memory and periodically
  flushed to the {@link Directory} (during the above method
  calls). A flush is triggered when there are enough added documents
@@ -134,7 +134,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
  another <code>IndexWriter</code> on the same directory will lead to a
  {@link LockObtainFailedException}.</p>

- <a name="deletionPolicy"></a>
+ <a id="deletionPolicy"></a>
  <p>Expert: <code>IndexWriter</code> allows an optional
  {@link IndexDeletionPolicy} implementation to be specified. You
  can use this to control when prior commits are deleted from
@@ -155,7 +155,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
  will likely result in poor performance compared to a local IO
  device. </p>

- <a name="mergePolicy"></a> <p>Expert:
+ <a id="mergePolicy"></a> <p>Expert:
  <code>IndexWriter</code> allows you to separately change
  the {@link MergePolicy} and the {@link MergeScheduler}.
  The {@link MergePolicy} is invoked whenever there are
@@ -167,14 +167,14 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
  it decides when and how to run the merges. The default is
  {@link ConcurrentMergeScheduler}. </p>

- <a name="OOME"></a><p><b>NOTE</b>: if you hit a
+ <a id="OOME"></a><p><b>NOTE</b>: if you hit a
  VirtualMachineError, or disaster strikes during a checkpoint
  then IndexWriter will close itself. This is a
  defensive measure in case any internal state (buffered
  documents, deletions, reference counts) were corrupted.
  Any subsequent calls will throw an AlreadyClosedException.</p>

- <a name="thread-safety"></a><p><b>NOTE</b>: {@link
+ <a id="thread-safety"></a><p><b>NOTE</b>: {@link
  IndexWriter} instances are completely thread
  safe, meaning multiple threads can call any of its
  methods, concurrently. If your application requires

@@ -34,7 +34,7 @@ import org.apache.lucene.util.Bits;
  rely on a given document having the same number between sessions.

  <p>
- <a name="thread-safety"></a><p><b>NOTE</b>: {@link
+ <a id="thread-safety"></a><p><b>NOTE</b>: {@link
  IndexReader} instances are completely thread
  safe, meaning multiple threads can call any of its methods,
  concurrently. If your application requires external

@@ -442,7 +442,7 @@ public abstract class MergePolicy {
  }

  /**
-  * Default ratio for compound file system usage. Set to <tt>1.0</tt>, always use
+  * Default ratio for compound file system usage. Set to <code>1.0</code>, always use
   * compound file system.
   */
  protected static final double DEFAULT_NO_CFS_RATIO = 1.0;

@@ -29,7 +29,7 @@ import java.io.IOException;
  * as documents are added to and deleted from an index. Clients should thus not
  * rely on a given document having the same number between sessions.
  *
- * <p><a name="thread-safety"></a><p><b>NOTE</b>: {@link
+ * <p><a id="thread-safety"></a><p><b>NOTE</b>: {@link
  * IndexReader} instances are completely thread
  * safe, meaning multiple threads can call any of its methods,
  * concurrently. If your application requires external

@@ -39,7 +39,8 @@ import org.apache.lucene.util.bkd.BKDWriter;
  * These structures are optimized for operations such as <i>range</i>, <i>distance</i>, <i>nearest-neighbor</i>,
  * and <i>point-in-polygon</i> queries.
  * <h1>Basic Point Types</h1>
- * <table summary="Basic point types in Java and Lucene">
+ * <table>
+ * <caption>Basic point types in Java and Lucene</caption>
  * <tr><th>Java type</th><th>Lucene class</th></tr>
  * <tr><td>{@code int}</td><td>{@link IntPoint}</td></tr>
  * <tr><td>{@code long}</td><td>{@link LongPoint}</td></tr>

@@ -54,7 +54,7 @@ import org.apache.lucene.util.Version;
  * segments in relation to the file system.
  * <p>
  * The active segments in the index are stored in the segment info file,
- * <tt>segments_N</tt>. There may be one or more <tt>segments_N</tt> files in
+ * <code>segments_N</code>. There may be one or more <code>segments_N</code> files in
  * the index; however, the one with the largest generation is the active one
  * (when older segments_N files are present it's because they temporarily cannot
  * be deleted, or a custom {@link IndexDeletionPolicy} is in
@@ -64,7 +64,7 @@ import org.apache.lucene.util.Version;
  * <p>
  * Files:
  * <ul>
- * <li><tt>segments_N</tt>: Header, LuceneVersion, Version, NameCounter, SegCount, MinSegmentLuceneVersion, <SegName,
+ * <li><code>segments_N</code>: Header, LuceneVersion, Version, NameCounter, SegCount, MinSegmentLuceneVersion, <SegName,
  * SegID, SegCodec, DelGen, DeletionCount, FieldInfosGen, DocValuesGen,
  * UpdatesFiles><sup>SegCount</sup>, CommitUserData, Footer
  * </ul>

@@ -50,7 +50,7 @@ final class Sorter {
  }

  /**
-  * A permutation of doc IDs. For every document ID between <tt>0</tt> and
+  * A permutation of doc IDs. For every document ID between <code>0</code> and
   * {@link IndexReader#maxDoc()}, <code>oldToNew(newToOld(docID))</code> must
   * return <code>docID</code>.
   */
@@ -394,7 +394,7 @@ final class Sorter {
  * {@link #sort(int, DocComparator)} to compute the old-to-new permutation
  * given a list of documents and their corresponding values.
  * <p>
- * A return value of <tt>null</tt> is allowed and means that
+ * A return value of <code>null</code> is allowed and means that
  * <code>reader</code> is already sorted.
  * <p>
  * <b>NOTE:</b> deleted documents are expected to appear in the mapping as
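The DocMap contract quoted above, that oldToNew(newToOld(docID)) must return docID, is just the statement that the two maps are inverse permutations; a sketch with invented arrays:

```java
public class DocMapInvariant {
  public static void main(String[] args) {
    int[] newToOld = {2, 0, 1};              // new docID -> old docID
    int[] oldToNew = new int[newToOld.length];
    for (int i = 0; i < newToOld.length; i++) {
      oldToNew[newToOld[i]] = i;             // invert the permutation
    }
    for (int docID = 0; docID < newToOld.length; docID++) {
      // the documented invariant
      if (oldToNew[newToOld[docID]] != docID) throw new AssertionError();
    }
    System.out.println("permutation is consistent");
  }
}
```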
@@ -51,10 +51,10 @@
  * </ul>
  * </li>
  * </ol>
- * <a name="index"></a>
+ * <a id="index"></a>
  * <h2>Index APIs</h2>

- * <a name="writer"></a>
+ * <a id="writer"></a>
  * <h3>IndexWriter</h3>

  * <p>{@link org.apache.lucene.index.IndexWriter} is used to create an index, and to add, update and
@@ -66,7 +66,7 @@
  * org.apache.lucene.store.FSDirectory}), but it may also stand for some other storage, such as
  * RAM.</p>

- * <a name="reader"></a>
+ * <a id="reader"></a>
  * <h3>IndexReader</h3>

  * <p>{@link org.apache.lucene.index.IndexReader} is used to read data from the index, and supports
@@ -76,7 +76,7 @@
  * org.apache.lucene.index.DirectoryReader#openIfChanged}) in order to incorporate writes that may
  * occur after it is opened.</p>

- * <a name="segments"></a>
+ * <a id="segments"></a>
  * <h3>Segments and docids</h3>

  * <p>Lucene's index is composed of segments, each of which contains a subset of all the documents
@@ -101,10 +101,10 @@
  * not exposed as part of an application, nor stored or referenced outside of Lucene's internal
  * APIs.</p>

- * <a name="field_types"></a>
+ * <a id="field_types"></a>
  * <h2>Field Types</h2>
  *
- * <a name="postings-desc"></a>
+ * <a id="postings-desc"></a>
  *
  * <p>Lucene supports a variety of different document field data structures. Lucene's core, the
  * inverted index, is comprised of "postings." The postings, with their term dictionary, can be
@@ -115,14 +115,14 @@
  * able to skip over low-scoring documents at search time. Postings do not provide any way of
  * retrieving terms given a document, short of scanning the entire index.</p>
  *
- * <a name="stored-fields"></a>
+ * <a id="stored-fields"></a>
  * <p>Stored fields are essentially the opposite of postings, providing efficient retrieval of field
  * values given a docid. All stored field values for a document are stored together in a
  * block. Different types of stored field provide high-level datatypes such as strings and numbers
  * on top of the underlying bytes. Stored field values are usually retrieved by the searcher using
  * an implementation of {@link org.apache.lucene.index.StoredFieldVisitor}.</p>

- * <a name="docvalues"></a>
+ * <a id="docvalues"></a>
  * <p>{@link org.apache.lucene.index.DocValues} fields are what are sometimes referred to as
  * columnar, or column-stride fields, by analogy to relational database terminology, in which
  * documents are considered as rows, and fields, columns. DocValues fields store values per-field: a
@@ -130,14 +130,14 @@
  * lookup of a field-value given a docid. These fields are used for efficient value-based sorting,
  * and for faceting, but they are not useful for filtering.</p>

- * <a name="points"></a>
+ * <a id="points"></a>
  * <p>{@link org.apache.lucene.index.PointValues} represent numeric values using a kd-tree data
  * structure. Efficient 1- and higher dimensional implementations make these the choice for numeric
  * range and interval queries, and geo-spatial queries.</p>

- * <a name="postings"></a>
+ * <a id="postings"></a>
  * <h2>Postings APIs</h2>
- * <a name="fields"></a>
+ * <a id="fields"></a>
  * <h3>
  * Fields
  * </h3>
@@ -159,7 +159,7 @@
  * Terms terms = fields.terms(field);
  * }
  * </pre>
- * <a name="terms"></a>
+ * <a id="terms"></a>
  * <h3>
  * Terms
  * </h3>
@@ -195,7 +195,7 @@
  * PostingsEnum docsAndPositions = termsEnum.postings(null, null, PostingsEnum.FLAG_POSITIONS);
  * }
  * </pre>
- * <a name="documents"></a>
+ * <a id="documents"></a>
  * <h3>
  * Documents
  * </h3>
@@ -210,7 +210,7 @@
  * System.out.println(docsEnum.freq());
  * }
  * </pre>
- * <a name="positions"></a>
+ * <a id="positions"></a>
  * <h3>
  * Positions
  * </h3>
@@ -233,9 +233,9 @@
  * }
  * }
  * </pre>
- * <a name="stats"></a>
+ * <a id="stats"></a>
  * <h2>Index Statistics</h2>
- * <a name="termstats"></a>
+ * <a id="termstats"></a>
  * <h3>
  * Term statistics
  * </h3>
@@ -249,7 +249,7 @@
  * of occurrences of this term across all documents. Like docFreq(), it will
  * also count occurrences that appear in deleted documents.
  * </ul>
- * <a name="fieldstats"></a>
+ * <a id="fieldstats"></a>
  * <h3>
  * Field statistics
  * </h3>
@@ -276,7 +276,7 @@
  * field, and like totalTermFreq() it will also count occurrences that appear in
  * deleted documents.
  * </ul>
- * <a name="segmentstats"></a>
+ * <a id="segmentstats"></a>
  * <h3>
  * Segment statistics
  * </h3>
@@ -290,7 +290,7 @@
  * <li>{@link org.apache.lucene.index.Fields#size}: Returns the number of indexed
  * fields.
  * </ul>
- * <a name="documentstats"></a>
+ * <a id="documentstats"></a>
  * <h3>
  * Document statistics
  * </h3>

@@ -38,7 +38,7 @@ import org.apache.lucene.util.ArrayUtil;
  * set is large this can easily be a very substantial amount
  * of RAM!
  *
- * <p>See the Lucene <tt>modules/grouping</tt> module for more
+ * <p>See the Lucene <code>modules/grouping</code> module for more
  * details including a full code example.</p>
  *
  * @lucene.experimental

@@ -78,7 +78,7 @@ import org.apache.lucene.util.ThreadInterruptedException;
  * {@link TopScoreDocCollector#create} or {@link TopFieldCollector#create} and
  * call {@link #search(Query, Collector)}.
  *
- * <a name="thread-safety"></a><p><b>NOTE</b>: <code>{@link
+ * <a id="thread-safety"></a><p><b>NOTE</b>: <code>{@link
  * IndexSearcher}</code> instances are completely
  * thread safe, meaning multiple threads can call any of its
  * methods, concurrently. If your application requires

@@ -143,7 +143,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
  * than 3% of the total number of documents in the index.
  * This should guarantee that all leaves from the upper
  * {@link TieredMergePolicy tier} will be cached while ensuring that at most
- * <tt>33</tt> leaves can make it to the cache (very likely less than 10 in
+ * <code>33</code> leaves can make it to the cache (very likely less than 10 in
  * practice), which is useful for this implementation since some operations
  * perform in linear time with the number of cached leaves.
  * Only clauses whose cost is at most 100x the cost of the top-level query will

@@ -39,7 +39,7 @@ public interface QueryCachingPolicy {
  /** Whether the given {@link Query} is worth caching.
  * This method will be called by the {@link QueryCache} to know whether to
  * cache. It will first attempt to load a {@link DocIdSet} from the cache.
- * If it is not cached yet and this method returns <tt>true</tt> then a
+ * If it is not cached yet and this method returns <code>true</code> then a
  * cache entry will be generated. Otherwise an uncached scorer will be
  * returned. */
  boolean shouldCache(Query query) throws IOException;

@@ -29,7 +29,7 @@
  * </ol>
  *
  *
- * <a name="search"></a>
+ * <a id="search"></a>
  * <h2>Search Basics</h2>
  * <p>
  * Lucene offers a wide variety of {@link org.apache.lucene.search.Query} implementations, most of which are in
@@ -50,7 +50,7 @@
  * <!-- TODO: this page over-links the same things too many times -->
  *
  *
- * <a name="query"></a>
+ * <a id="query"></a>
  * <h2>Query Classes</h2>
  * <h3>
  * {@link org.apache.lucene.search.TermQuery TermQuery}
@@ -74,8 +74,8 @@
  * TermQuery tq = new TermQuery(new Term("fieldName", "term"));
  * </pre>In this example, the {@link org.apache.lucene.search.Query Query} identifies all
  * {@link org.apache.lucene.document.Document Document}s that have the
- * {@link org.apache.lucene.document.Field Field} named <tt>"fieldName"</tt>
- * containing the word <tt>"term"</tt>.
+ * {@link org.apache.lucene.document.Field Field} named <code>"fieldName"</code>
+ * containing the word <code>"term"</code>.
  * <h3>
  * {@link org.apache.lucene.search.BooleanQuery BooleanQuery}
  * </h3>
@@ -175,11 +175,11 @@
  * The {@link org.apache.lucene.search.PrefixQuery PrefixQuery} allows an application
  * to identify all documents with terms that begin with a certain string. The
  * {@link org.apache.lucene.search.WildcardQuery WildcardQuery} generalizes this by allowing
- * for the use of <tt>*</tt> (matches 0 or more characters) and <tt>?</tt> (matches exactly one character) wildcards.
+ * for the use of <code>*</code> (matches 0 or more characters) and <code>?</code> (matches exactly one character) wildcards.
  * Note that the {@link org.apache.lucene.search.WildcardQuery WildcardQuery} can be quite slow. Also
  * note that
  * {@link org.apache.lucene.search.WildcardQuery WildcardQuery} should
- * not start with <tt>*</tt> and <tt>?</tt>, as these are extremely slow.
+ * not start with <code>*</code> and <code>?</code>, as these are extremely slow.
  * Some QueryParsers may not allow this by default, but provide a <code>setAllowLeadingWildcard</code> method
  * to remove that protection.
  * The {@link org.apache.lucene.search.RegexpQuery RegexpQuery} is even more general than WildcardQuery,
@@ -196,7 +196,7 @@
  * This type of query can be useful when accounting for spelling variations in the collection.
  *
  *
- * <a name="scoring"></a>
+ * <a id="scoring"></a>
  * <h2>Scoring — Introduction</h2>
  * <p>Lucene scoring is the heart of why we all love Lucene. It is blazingly fast and it hides
  * almost all of the complexity from the user. In a nutshell, it works. At least, that is,
@@ -229,7 +229,7 @@
  * Finally, we will finish up with some reference material in the <a href="#algorithm">Appendix</a>.
  *
  *
- * <a name="scoringBasics"></a>
+ * <a id="scoringBasics"></a>
  * <h2>Scoring — Basics</h2>
  * <p>Scoring is very much dependent on the way documents are indexed, so it is important to understand
  * indexing. (see <a href="{@docRoot}/overview-summary.html#overview_description">Lucene overview</a>
@@ -257,7 +257,7 @@
  * <p>Lucene allows influencing the score contribution of various parts of the query by wrapping with
  * {@link org.apache.lucene.search.BoostQuery}.</p>
  *
- * <a name="changingScoring"></a>
+ * <a id="changingScoring"></a>
  * <h2>Changing Scoring — Similarity</h2>
  * <h3>Changing the scoring formula</h3>
  * <p>
@@ -323,7 +323,7 @@
  * expr.getDoubleValuesSource(bindings));
  * </pre>
  *
- * <a name="customQueriesExpert"></a>
+ * <a id="customQueriesExpert"></a>
  * <h2>Custom Queries — Expert Level</h2>
  *
  * <p>Custom queries are an expert level task, so tread carefully and be prepared to share your code if
@@ -374,7 +374,7 @@
  * {@link org.apache.lucene.search.BooleanQuery BooleanQuery}, <span
  * >and other queries that implement {@link org.apache.lucene.search.Query#createWeight(IndexSearcher,ScoreMode,float) createWeight(IndexSearcher searcher,ScoreMode scoreMode, float boost)}</span></li>
  * </ol>
- * <a name="weightClass"></a>
+ * <a id="weightClass"></a>
  * <h3>The Weight Interface</h3>
  * <p>The
  * {@link org.apache.lucene.search.Weight Weight}
@@ -402,7 +402,7 @@
  * and offsets of matches. This is typically useful to implement highlighting.
  * </li>
  * </ol>
- * <a name="scorerClass"></a>
+ * <a id="scorerClass"></a>
  * <h3>The Scorer Class</h3>
  * <p>The
  * {@link org.apache.lucene.search.Scorer Scorer}
@@ -431,7 +431,7 @@
  * details on the scoring process.
  * </li>
  * </ol>
- * <a name="bulkScorerClass"></a>
+ * <a id="bulkScorerClass"></a>
  * <h3>The BulkScorer Class</h3>
  * <p>The
  * {@link org.apache.lucene.search.BulkScorer BulkScorer} scores a range of documents. There is only one
@@ -453,7 +453,7 @@
  * <!-- TODO: integrate this better, it's better served as an intro than an appendix -->
  *
  *
- * <a name="algorithm"></a>
+ * <a id="algorithm"></a>
  * <h2>Appendix: Search Algorithm</h2>
  * <p>This section is mostly notes on stepping through the Scoring process and serves as
  * fertilizer for the earlier sections.

@@ -42,7 +42,7 @@ import org.apache.lucene.util.SmallFloat;
  * this class at both <a href="#indextime">index-time</a> and
  * <a href="#querytime">query-time</a>.
  * <p>
- * <a name="indextime">Indexing Time</a>
+ * <a id="indextime">Indexing Time</a>
  * At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing
  * the Similarity implementation to set a per-document value for the field that will
  * be later accessible via {@link org.apache.lucene.index.LeafReader#getNormValues(String)}.
@@ -60,7 +60,7 @@ import org.apache.lucene.util.SmallFloat;
  * Additional scoring factors can be stored in named {@link NumericDocValuesField}s and
  * accessed at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}.
  * However this should not be done in the {@link Similarity} but externally, for instance
- * by using <tt>FunctionScoreQuery</tt>.
+ * by using <code>FunctionScoreQuery</code>.
  * <p>
  * Finally, using index-time boosts (either via folding into the normalization byte or
  * via DocValues), is an inefficient way to boost the scores of different fields if the
@@ -68,7 +68,7 @@ import org.apache.lucene.util.SmallFloat;
  * boost parameter <i>C</i>, and {@link PerFieldSimilarityWrapper} can return different
  * instances with different boosts depending upon field name.
  * <p>
- * <a name="querytime">Query time</a>
+ * <a id="querytime">Query time</a>
  * At query-time, Queries interact with the Similarity via these steps:
  * <ol>
  * <li>The {@link #scorer(float, CollectionStatistics, TermStatistics...)} method is called a single time,
@@ -80,7 +80,7 @@ import org.apache.lucene.util.SmallFloat;
  * <li>Then {@link SimScorer#score(float, long)} is called for every matching document to compute its score.
  * </ol>
  * <p>
- * <a name="explaintime">Explanations</a>
+ * <a id="explaintime">Explanations</a>
  * When {@link IndexSearcher#explain(org.apache.lucene.search.Query, int)} is called, queries consult the Similarity's DocScorer for an
  * explanation of how it computed its score. The query passes in a the document id and an explanation of how the frequency
  * was computed.

@@ -79,20 +79,24 @@ import org.apache.lucene.util.SmallFloat;
  * of the weighted query vectors <i>V(q)</i> and <i>V(d)</i>:
  *
  * <br> <br>
- * <table cellpadding="2" cellspacing="2" border="0" style="width:auto; margin-left:auto; margin-right:auto" summary="formatting only">
+ * <table class="padding2" style="border-spacing: 2px; border-collapse: separate; border: 0; width:auto; margin-left:auto; margin-right:auto">
+ * <caption>formatting only</caption>
  * <tr><td>
- * <table cellpadding="1" cellspacing="0" border="1" style="margin-left:auto; margin-right:auto" summary="formatting only">
+ * <table class="padding1" style="border-spacing: 0px; border-collapse: separate; border: 1px solid; margin-left:auto; margin-right:auto">
+ * <caption>formatting only</caption>
  * <tr><td>
- * <table cellpadding="2" cellspacing="2" border="0" style="margin-left:auto; margin-right:auto" summary="cosine similarity formula">
+ * <table class="padding2" style="border-spacing: 2px; border-collapse: separate; border: 0; margin-left:auto; margin-right:auto">
+ * <caption>cosine similarity formula</caption>
  * <tr>
- * <td valign="middle" align="right" rowspan="1">
+ * <td valign="middle" style="text-align: right" rowspan="1">
  * cosine-similarity(q,d) =
  * </td>
- * <td valign="middle" align="center">
- * <table summary="cosine similarity formula">
- * <tr><td align="center" style="text-align: center"><small>V(q) · V(d)</small></td></tr>
- * <tr><td align="center" style="text-align: center">–––––––––</td></tr>
- * <tr><td align="center" style="text-align: center"><small>|V(q)| |V(d)|</small></td></tr>
+ * <td valign="middle" style="text-align: center">
+ * <table>
+ * <caption>cosine similarity formula</caption>
+ * <tr><td style="text-align: center"><small>V(q) · V(d)</small></td></tr>
+ * <tr><td style="text-align: center">–––––––––</td></tr>
+ * <tr><td style="text-align: center"><small>|V(q)| |V(d)|</small></td></tr>
  * </table>
  * </td>
  * </tr>
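In plain notation, the nested tables above render:

```latex
\mathrm{cosine\text{-}similarity}(q,d) =
  \frac{V(q) \cdot V(d)}{\lvert V(q)\rvert \, \lvert V(d)\rvert}
```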
@@ -101,7 +105,7 @@ import org.apache.lucene.util.SmallFloat;
  * </table>
  * </td></tr>
  * <tr><td>
- * <center><u>VSM Score</u></center>
+ * <u style="text-align: center">VSM Score</u>
  * </td></tr>
  * </table>
  * <br> <br>
@@ -161,24 +165,28 @@ import org.apache.lucene.util.SmallFloat;
  * we get <i>Lucene's Conceptual scoring formula</i>:
  *
  * <br> <br>
- * <table cellpadding="2" cellspacing="2" border="0" style="width:auto; margin-left:auto; margin-right:auto" summary="formatting only">
+ * <table class="padding2" style="border-spacing: 2px; border-collapse: separate; border: 0; width:auto; margin-left:auto; margin-right:auto">
+ * <caption>formatting only</caption>
  * <tr><td>
- * <table cellpadding="1" cellspacing="0" border="1" style="margin-left:auto; margin-right:auto" summary="formatting only">
+ * <table class="padding1" style="border-spacing: 0px; border-collapse: separate; border: 1px solid; margin-left:auto; margin-right:auto">
+ * <caption>formatting only</caption>
  * <tr><td>
- * <table cellpadding="2" cellspacing="2" border="0" style="margin-left:auto; margin-right:auto" summary="formatting only">
+ * <table class="padding2" style="border-spacing: 2px; border-collapse: separate; border: 0; margin-left:auto; margin-right:auto">
+ * <caption>formatting only</caption>
  * <tr>
- * <td valign="middle" align="right" rowspan="1">
+ * <td valign="middle" style="text-align: right" rowspan="1">
  * score(q,d) =
  * <span style="color: #CCCC00">query-boost(q)</span> ·
  * </td>
- * <td valign="middle" align="center">
- * <table summary="Lucene conceptual scoring formula">
- * <tr><td align="center" style="text-align: center"><small><span style="color: #993399">V(q) · V(d)</span></small></td></tr>
- * <tr><td align="center" style="text-align: center">–––––––––</td></tr>
- * <tr><td align="center" style="text-align: center"><small><span style="color: #FF33CC">|V(q)|</span></small></td></tr>
+ * <td valign="middle" style="text-align: center">
+ * <table>
+ * <caption>Lucene conceptual scoring formula</caption>
+ * <tr><td style="text-align: center"><small><span style="color: #993399">V(q) · V(d)</span></small></td></tr>
+ * <tr><td style="text-align: center">–––––––––</td></tr>
+ * <tr><td style="text-align: center"><small><span style="color: #FF33CC">|V(q)|</span></small></td></tr>
  * </table>
  * </td>
- * <td valign="middle" align="right" rowspan="1">
+ * <td valign="middle" style="text-align: right" rowspan="1">
  * · <span style="color: #3399FF">doc-len-norm(d)</span>
  * · <span style="color: #3399FF">doc-boost(d)</span>
  * </td>
@@ -188,7 +196,7 @@ import org.apache.lucene.util.SmallFloat;
  * </table>
  * </td></tr>
  * <tr><td>
- * <center><u>Lucene Conceptual Scoring Formula</u></center>
+ * <u style="text-align: center">Lucene Conceptual Scoring Formula</u>
  * </td></tr>
  * </table>
  * <br> <br>
@@ -246,28 +254,31 @@ import org.apache.lucene.util.SmallFloat;
  * The color codes demonstrate how it relates
  * to those of the <i>conceptual</i> formula:
  *
- * <table cellpadding="2" cellspacing="2" border="0" style="width:auto; margin-left:auto; margin-right:auto" summary="formatting only">
+ * <table class="padding2" style="border-spacing: 2px; border-collapse: separate; border: 0; width:auto; margin-left:auto; margin-right:auto">
+ * <caption>formatting only</caption>
  * <tr><td>
- * <table cellpadding="" cellspacing="2" border="2" style="margin-left:auto; margin-right:auto" summary="formatting only">
+ * <table style="border-spacing: 2px; border-collapse: separate; border: 2px solid; margin-left:auto; margin-right:auto">
+ * <caption>formatting only</caption>
  * <tr><td>
- * <table cellpadding="2" cellspacing="2" border="0" style="margin-left:auto; margin-right:auto" summary="Lucene conceptual scoring formula">
+ * <table class="padding2" style="border-spacing: 2px; border-collapse: separate; border: 0; margin-left:auto; margin-right:auto">
+ * <caption>Lucene conceptual scoring formula</caption>
  * <tr>
- * <td valign="middle" align="right" rowspan="1">
+ * <td valign="middle" style="text-align: right" rowspan="1">
  * score(q,d) =
- * <big><big><big>∑</big></big></big>
+ * <span style="font-size: larger">∑</span>
  * </td>
- * <td valign="middle" align="right" rowspan="1">
- * <big><big>(</big></big>
+ * <td valign="middle" style="text-align: right" rowspan="1">
+ * <span style="font-size: larger">(</span>
  * <A HREF="#formula_tf"><span style="color: #993399">tf(t in d)</span></A> ·
  * <A HREF="#formula_idf"><span style="color: #993399">idf(t)</span></A><sup>2</sup> ·
  * <A HREF="#formula_termBoost"><span style="color: #CCCC00">t.getBoost()</span></A> ·
  * <A HREF="#formula_norm"><span style="color: #3399FF">norm(t,d)</span></A>
- * <big><big>)</big></big>
+ * <span style="font-size: larger">)</span>
  * </td>
  * </tr>
  * <tr valign="top">
  * <td></td>
- * <td align="center" style="text-align: center"><small>t in q</small></td>
+ * <td style="text-align: center"><small>t in q</small></td>
  * <td></td>
  * </tr>
  * </table>
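Stripped of the table markup, the practical scoring function above is:

```latex
\mathrm{score}(q,d) = \sum_{t \in q}
  \Bigl( \mathrm{tf}(t \in d) \cdot \mathrm{idf}(t)^{2} \cdot
         t.\mathrm{getBoost}() \cdot \mathrm{norm}(t,d) \Bigr)
```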
@@ -275,14 +286,14 @@ import org.apache.lucene.util.SmallFloat;
  * </table>
  * </td></tr>
  * <tr><td>
- * <center><u>Lucene Practical Scoring Function</u></center>
+ * <u style="text-align: center">Lucene Practical Scoring Function</u>
  * </td></tr>
  * </table>
  *
  * <p> where
  * <ol>
  * <li>
- * <A NAME="formula_tf"></A>
+ * <a id="formula_tf"></A>
  * <b><i>tf(t in d)</i></b>
  * correlates to the term's <i>frequency</i>,
  * defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
@@ -295,13 +306,14 @@ import org.apache.lucene.util.SmallFloat;
  * {@link org.apache.lucene.search.similarities.ClassicSimilarity#tf(float) ClassicSimilarity} is:
  *
  * <br> <br>
- * <table cellpadding="2" cellspacing="2" border="0" style="width:auto; margin-left:auto; margin-right:auto" summary="term frequency computation">
+ * <table class="padding2" style="border-spacing: 2px; border-collapse: separate; border: 0; width:auto; margin-left:auto; margin-right:auto">
+ * <caption>term frequency computation</caption>
  * <tr>
- * <td valign="middle" align="right" rowspan="1">
+ * <td valign="middle" style="text-align: right" rowspan="1">
  * {@link org.apache.lucene.search.similarities.ClassicSimilarity#tf(float) tf(t in d)} =
  * </td>
- * <td valign="top" align="center" rowspan="1">
- * frequency<sup><big>½</big></sup>
+ * <td valign="top" style="text-align: center" rowspan="1">
+ * frequency<sup><span style="font-size: larger">½</span></sup>
  * </td>
  * </tr>
  * </table>
@@ -309,7 +321,7 @@ import org.apache.lucene.util.SmallFloat;
  * </li>
  *
  * <li>
- * <A NAME="formula_idf"></A>
+ * <a id="formula_idf"></A>
  * <b><i>idf(t)</i></b> stands for Inverse Document Frequency. This value
  * correlates to the inverse of <i>docFreq</i>
  * (the number of documents in which the term <i>t</i> appears).
@@ -320,23 +332,25 @@ import org.apache.lucene.util.SmallFloat;
  * {@link org.apache.lucene.search.similarities.ClassicSimilarity#idf(long, long) ClassicSimilarity} is:
  *
  * <br> <br>
- * <table cellpadding="2" cellspacing="2" border="0" style="width:auto; margin-left:auto; margin-right:auto" summary="inverse document frequency computation">
+ * <table class="padding2" style="border-spacing: 2px; border-collapse: separate; border: 0; width:auto; margin-left:auto; margin-right:auto">
+ * <caption>inverse document frequency computation</caption>
  * <tr>
- * <td valign="middle" align="right">
+ * <td valign="middle" style="text-align: right">
  * {@link org.apache.lucene.search.similarities.ClassicSimilarity#idf(long, long) idf(t)} =
  * </td>
- * <td valign="middle" align="center">
- * 1 + log <big>(</big>
+ * <td valign="middle" style="text-align: center">
+ * 1 + log <span style="font-size: larger">(</span>
  * </td>
- * <td valign="middle" align="center">
- * <table summary="inverse document frequency computation">
- * <tr><td align="center" style="text-align: center"><small>docCount+1</small></td></tr>
- * <tr><td align="center" style="text-align: center">–––––––––</td></tr>
- * <tr><td align="center" style="text-align: center"><small>docFreq+1</small></td></tr>
+ * <td valign="middle" style="text-align: center">
+ * <table>
+ * <caption>inverse document frequency computation</caption>
+ * <tr><td style="text-align: center"><small>docCount+1</small></td></tr>
+ * <tr><td style="text-align: center">–––––––––</td></tr>
+ * <tr><td style="text-align: center"><small>docFreq+1</small></td></tr>
  * </table>
  * </td>
- * <td valign="middle" align="center">
- * <big>)</big>
+ * <td valign="middle" style="text-align: center">
+ * <span style="font-size: larger">)</span>
  * </td>
  * </tr>
  * </table>
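For reference, the two ClassicSimilarity defaults rendered by the tables above are:

```latex
\mathrm{tf}(t \in d) = \sqrt{\mathrm{frequency}}
\qquad
\mathrm{idf}(t) = 1 + \log\!\left(\frac{\mathrm{docCount} + 1}{\mathrm{docFreq} + 1}\right)
```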
@@ -344,7 +358,7 @@ import org.apache.lucene.util.SmallFloat;
  * </li>
  *
  * <li>
- * <A NAME="formula_termBoost"></A>
+ * <a id="formula_termBoost"></A>
  * <b><i>t.getBoost()</i></b>
  * is a search time boost of term <i>t</i> in the query <i>q</i> as
  * specified in the query text
@@ -360,7 +374,7 @@ import org.apache.lucene.util.SmallFloat;
  * </li>
  *
  * <li>
- * <A NAME="formula_norm"></A>
+ * <a id="formula_norm"></A>
  * <b><i>norm(t,d)</i></b> is an index-time boost factor that solely
  * depends on the number of tokens of this field in the document, so
  * that shorter fields contribute more to the score.

@@ -29,7 +29,7 @@
  * </ol>
  *
  *
- * <a name="sims"></a>
+ * <a id="sims"></a>
  * <h2>Summary of the Ranking Methods</h2>
  *
  * <p>{@link org.apache.lucene.search.similarities.BM25Similarity} is an optimized
@@ -46,7 +46,7 @@
  * Lucene ships the following methods built on
  * {@link org.apache.lucene.search.similarities.SimilarityBase}:
  *
- * <a name="framework"></a>
+ * <a id="framework"></a>
  * <ul>
  * <li>Amati and Rijsbergen's {@linkplain org.apache.lucene.search.similarities.DFRSimilarity DFR} framework;</li>
  * <li>Clinchant and Gaussier's {@linkplain org.apache.lucene.search.similarities.IBSimilarity Information-based models}
@@ -66,7 +66,7 @@
  * optimizations can always be implemented in subclasses; see
  * <a href="#changingSimilarity">below</a>.
  *
- * <a name="changingSimilarity"></a>
+ * <a id="changingSimilarity"></a>
  * <h2>Changing Similarity</h2>
  *
  * <p>Chances are the available Similarities are sufficient for all
@@ -94,12 +94,12 @@
  * <p>{@link org.apache.lucene.search.similarities.BM25Similarity} has
  * two parameters that may be tuned:
  * <ul>
- * <li><tt>k1</tt>, which calibrates term frequency saturation and must be
+ * <li><code>k1</code>, which calibrates term frequency saturation and must be
  * positive or null. A value of {@code 0} makes term frequency completely
- * ignored, making documents scored only based on the value of the <tt>IDF</tt>
- * of the matched terms. Higher values of <tt>k1</tt> increase the impact of
+ * ignored, making documents scored only based on the value of the <code>IDF</code>
+ * of the matched terms. Higher values of <code>k1</code> increase the impact of
  * term frequency on the final score. Default value is {@code 1.2}.</li>
- * <li><tt>b</tt>, which controls how much document length should normalize
+ * <li><code>b</code>, which controls how much document length should normalize
  * term frequency values and must be in {@code [0, 1]}. A value of {@code 0}
  * disables length normalization completely. Default value is {@code 0.75}.</li>
  * </ul>
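Tuning the two parameters described above is a one-liner; a sketch (the values are just the documented defaults, not a recommendation):

```java
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;

public class Bm25Tuning {
  static void configure(IndexSearcher searcher) {
    // k1 = 1.2, b = 0.75: raising k1 increases the impact of term frequency,
    // lowering b weakens document-length normalization
    searcher.setSimilarity(new BM25Similarity(1.2f, 0.75f));
  }
}
```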
@@ -90,12 +90,13 @@ public abstract class DataOutput {
  * resulting integer value. Thus values from zero to 127 may be stored in a single
  * byte, values from 128 to 16,383 may be stored in two bytes, and so on.</p>
  * <p>VByte Encoding Example</p>
- * <table cellspacing="0" cellpadding="2" border="0" summary="variable length encoding examples">
+ * <table class="padding2" style="border-spacing: 0px; border-collapse: separate; border: 0">
+ * <caption>variable length encoding examples</caption>
  * <tr valign="top">
- * <th align="left">Value</th>
- * <th align="left">Byte 1</th>
- * <th align="left">Byte 2</th>
- * <th align="left">Byte 3</th>
+ * <th style="text-align:left">Value</th>
+ * <th style="text-align:left">Byte 1</th>
+ * <th style="text-align:left">Byte 2</th>
+ * <th style="text-align:left">Byte 3</th>
  * </tr>
  * <tr valign="bottom">
  * <td>0</td>
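The encoding this table documents takes only a few lines; a sketch of the writer side (it mirrors the documented format, it is not DataOutput itself):

```java
import java.io.ByteArrayOutputStream;

public class VByteSketch {
  static void writeVInt(ByteArrayOutputStream out, int i) {
    while ((i & ~0x7F) != 0) {        // more than 7 significant bits remain
      out.write((i & 0x7F) | 0x80);   // low 7 bits, continuation bit set
      i >>>= 7;
    }
    out.write(i);                     // last byte has its high bit clear
  }

  public static void main(String[] args) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    writeVInt(out, 128);              // first two-byte value: 0x80 0x01
    System.out.println(out.size());   // 2
  }
}
```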
@@ -47,7 +47,7 @@ import org.apache.lucene.util.IOUtils;
  /**
  * Base class for Directory implementations that store index
  * files in the file system.
- * <a name="subclasses"></a>
+ * <a id="subclasses"></a>
  * There are currently three core
  * subclasses:
  *

@@ -35,7 +35,7 @@ public class OutputStreamIndexOutput extends IndexOutput {
  /**
  * Creates a new {@link OutputStreamIndexOutput} with the given buffer size.
  * @param bufferSize the buffer size in bytes used to buffer writes internally.
- * @throws IllegalArgumentException if the given buffer size is less or equal to <tt>0</tt>
+ * @throws IllegalArgumentException if the given buffer size is less or equal to <code>0</code>
  */
  public OutputStreamIndexOutput(String resourceDescription, String name, OutputStream out, int bufferSize) {
  super(resourceDescription, name);

@@ -167,8 +167,8 @@ public final class BitUtil {
  /**
  * <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">Zig-zag</a>
  * encode the provided long. Assuming the input is a signed long whose
- * absolute value can be stored on <tt>n</tt> bits, the returned value will
- * be an unsigned long that can be stored on <tt>n+1</tt> bits.
+ * absolute value can be stored on <code>n</code> bits, the returned value will
+ * be an unsigned long that can be stored on <code>n+1</code> bits.
  */
  public static long zigZagEncode(long l) {
  return (l >> 63) ^ (l << 1);
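The encoder shown above pairs with a one-line decoder; the decoder here is the standard inverse and is not part of this patch:

```java
public class ZigZagSketch {
  static long zigZagEncode(long l) {
    return (l >> 63) ^ (l << 1);  // as in BitUtil above
  }

  static long zigZagDecode(long l) {
    return (l >>> 1) ^ -(l & 1);  // undoes the interleaving
  }

  public static void main(String[] args) {
    // 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3: n bits of magnitude fit in n+1 bits
    for (long v : new long[] {0, -1, 1, -2, 2}) {
      long enc = zigZagEncode(v);
      System.out.println(v + " -> " + enc + " -> " + zigZagDecode(enc));
    }
  }
}
```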
@ -138,7 +138,7 @@ public final class ByteBlockPool implements Accountable {
|
|||
|
||||
/**
|
||||
* Resets the pool to its initial state reusing the first buffer and fills all
|
||||
* buffers with <tt>0</tt> bytes before they reused or passed to
|
||||
* buffers with <code>0</code> bytes before they reused or passed to
|
||||
* {@link Allocator#recycleByteBlocks(byte[][], int, int)}. Calling
|
||||
* {@link ByteBlockPool#nextBuffer()} is not needed after reset.
|
||||
*/
|
||||
|
@ -149,7 +149,7 @@ public final class ByteBlockPool implements Accountable {
|
|||
/**
|
||||
* Expert: Resets the pool to its initial state reusing the first buffer. Calling
|
||||
* {@link ByteBlockPool#nextBuffer()} is not needed after reset.
|
||||
* @param zeroFillBuffers if <code>true</code> the buffers are filled with <tt>0</tt>.
|
||||
* @param zeroFillBuffers if <code>true</code> the buffers are filled with <code>0</code>.
|
||||
* This should be set to <code>true</code> if this pool is used with slices.
|
||||
* @param reuseFirst if <code>true</code> the first buffer will be reused and calling
|
||||
* {@link ByteBlockPool#nextBuffer()} is not needed after reset iff the
|
||||
|
@ -349,7 +349,7 @@ public final class ByteBlockPool implements Accountable {
|
|||
|
||||
/**
|
||||
* Reads bytes out of the pool starting at the given offset with the given
|
||||
* length into the given byte array at offset <tt>off</tt>.
|
||||
* length into the given byte array at offset <code>off</code>.
|
||||
* <p>Note: this method allows to copy across block boundaries.</p>
|
||||
*/
|
||||
public void readBytes(final long offset, final byte bytes[], int bytesOffset, int bytesLength) {
|
||||
|
|
|
@ -33,10 +33,10 @@ public final class Constants {
|
|||
public static final String JVM_NAME = System.getProperty("java.vm.name");
|
||||
public static final String JVM_SPEC_VERSION = System.getProperty("java.specification.version");
|
||||
|
||||
/** The value of <tt>System.getProperty("java.version")</tt>. **/
|
||||
/** The value of <code>System.getProperty("java.version")</code>. **/
|
||||
public static final String JAVA_VERSION = System.getProperty("java.version");
|
||||
|
||||
/** The value of <tt>System.getProperty("os.name")</tt>. **/
|
||||
/** The value of <code>System.getProperty("os.name")</code>. **/
|
||||
public static final String OS_NAME = System.getProperty("os.name");
|
||||
/** True iff running on Linux. */
|
||||
public static final boolean LINUX = OS_NAME.startsWith("Linux");
|
||||
|
|
|
@ -23,7 +23,7 @@ import java.util.NoSuchElementException;
|
|||
* An {@link Iterator} implementation that filters elements with a boolean predicate.
|
||||
*
|
||||
* @param <T> generic parameter for this iterator instance: this iterator implements {@link Iterator Iterator<T>}
|
||||
* @param <InnerT> generic parameter of the wrapped iterator, must be <tt>T</tt> or extend <tt>T</tt>
|
||||
* @param <InnerT> generic parameter of the wrapped iterator, must be <code>T</code> or extend <code>T</code>
|
||||
* @see #predicateFunction
|
||||
* @lucene.internal
|
||||
*/
|
||||
|
|
|
@ -64,21 +64,21 @@ public final class IOUtils {
|
|||
private IOUtils() {} // no instance
|
||||
|
||||
/**
|
||||
* Closes all given <tt>Closeable</tt>s. Some of the
|
||||
* <tt>Closeable</tt>s may be null; they are
|
||||
* Closes all given <code>Closeable</code>s. Some of the
|
||||
* <code>Closeable</code>s may be null; they are
|
||||
* ignored. After everything is closed, the method either
|
||||
* throws the first exception it hit while closing, or
|
||||
* completes normally if there were no exceptions.
|
||||
*
|
||||
* @param objects
|
||||
* objects to call <tt>close()</tt> on
|
||||
* objects to call <code>close()</code> on
|
||||
*/
|
||||
public static void close(Closeable... objects) throws IOException {
|
||||
close(Arrays.asList(objects));
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes all given <tt>Closeable</tt>s.
|
||||
* Closes all given <code>Closeable</code>s.
|
||||
* @see #close(Closeable...)
|
||||
*/
|
||||
public static void close(Iterable<? extends Closeable> objects) throws IOException {
|
||||
|
@@ -99,18 +99,18 @@ public final class IOUtils {
  }

  /**
-  * Closes all given <tt>Closeable</tt>s, suppressing all thrown exceptions.
-  * Some of the <tt>Closeable</tt>s may be null, they are ignored.
+  * Closes all given <code>Closeable</code>s, suppressing all thrown exceptions.
+  * Some of the <code>Closeable</code>s may be null, they are ignored.
   *
   * @param objects
-  *          objects to call <tt>close()</tt> on
+  *          objects to call <code>close()</code> on
   */
  public static void closeWhileHandlingException(Closeable... objects) {
    closeWhileHandlingException(Arrays.asList(objects));
  }

  /**
-  * Closes all given <tt>Closeable</tt>s, suppressing all thrown non {@link VirtualMachineError} exceptions.
+  * Closes all given <code>Closeable</code>s, suppressing all thrown non {@link VirtualMachineError} exceptions.
   * Even if a {@link VirtualMachineError} is thrown all given closeable are closed.
   * @see #closeWhileHandlingException(Closeable...)
   */

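For context, a minimal sketch of how the two close variants documented above are typically used (the Closeable parameters are hypothetical):

import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.util.IOUtils;

public class CloseSketch {
  static void closeAll(Closeable input, Closeable output) throws IOException {
    // Rethrows the first exception hit while closing; null arguments are ignored.
    IOUtils.close(input, output);
  }

  static void closeQuietly(Closeable input, Closeable output) {
    // Suppresses all close-time exceptions (VirtualMachineErrors excepted).
    IOUtils.closeWhileHandlingException(input, output);
  }
}
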
@@ -261,8 +261,8 @@ public final class IOUtils {
  }

  /**
-  * Deletes all given <tt>Path</tt>s, if they exist. Some of the
-  * <tt>File</tt>s may be null; they are
+  * Deletes all given <code>Path</code>s, if they exist. Some of the
+  * <code>File</code>s may be null; they are
   * ignored. After everything is deleted, the method either
   * throws the first exception it hit while deleting, or
   * completes normally if there were no exceptions.

@@ -274,8 +274,8 @@ public final class IOUtils {
  }

  /**
-  * Deletes all given <tt>Path</tt>s, if they exist. Some of the
-  * <tt>File</tt>s may be null; they are
+  * Deletes all given <code>Path</code>s, if they exist. Some of the
+  * <code>File</code>s may be null; they are
   * ignored. After everything is deleted, the method either
   * throws the first exception it hit while deleting, or
   * completes normally if there were no exceptions.

@@ -99,7 +99,7 @@ public final class IntBlockPool {

  /**
   * Expert: Resets the pool to its initial state reusing the first buffer.
-  * @param zeroFillBuffers if <code>true</code> the buffers are filled with <tt>0</tt>.
+  * @param zeroFillBuffers if <code>true</code> the buffers are filled with <code>0</code>.
   * This should be set to <code>true</code> if this pool is used with
   * {@link SliceWriter}.
   * @param reuseFirst if <code>true</code> the first buffer will be reused and calling

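For context, a minimal sketch of the reset contract documented above, assuming the pool is fed through a SliceWriter (the written value is illustrative):

import org.apache.lucene.util.IntBlockPool;

public class ResetSketch {
  public static void main(String[] args) {
    IntBlockPool pool = new IntBlockPool();
    IntBlockPool.SliceWriter writer = new IntBlockPool.SliceWriter(pool);
    writer.startNewSlice();
    writer.writeInt(42);
    // Zero-fill so stale slice data cannot leak into later reads,
    // and reuse the first buffer to avoid re-allocation:
    pool.reset(true, true);
  }
}
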
@@ -114,7 +114,7 @@ public abstract class PriorityQueue<T> implements Iterable<T> {

  /** Determines the ordering of objects in this priority queue. Subclasses
   * must define this one method.
-  * @return <code>true</code> iff parameter <tt>a</tt> is less than parameter <tt>b</tt>.
+  * @return <code>true</code> iff parameter <code>a</code> is less than parameter <code>b</code>.
   */
  protected abstract boolean lessThan(T a, T b);

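For context, a minimal sketch of the lessThan contract documented above: a bounded min-queue of Integers (capacity and values are illustrative):

import org.apache.lucene.util.PriorityQueue;

public class MinQueueSketch {
  public static void main(String[] args) {
    PriorityQueue<Integer> pq = new PriorityQueue<Integer>(10) {
      @Override
      protected boolean lessThan(Integer a, Integer b) {
        return a < b; // the least element under this ordering sits at top()
      }
    };
    pq.add(3);
    pq.add(1);
    pq.add(2);
    System.out.println(pq.top()); // prints 1
  }
}
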
@@ -26,7 +26,7 @@ import org.apache.lucene.search.DocIdSetIterator;
  * The way it works is that the space of bits is divided into blocks of
  * 4096 bits, which is 64 longs. Then for each block, we have:<ul>
  * <li>a long[] which stores the non-zero longs for that block</li>
- * <li>a long so that bit <tt>i</tt> being set means that the <code>i-th</code>
+ * <li>a long so that bit <code>i</code> being set means that the <code>i-th</code>
  * long of the block is non-null, and its offset in the array of longs is
  * the number of one bits on the right of the <code>i-th</code> bit.</li></ul>
  *

@@ -132,7 +132,7 @@ public class SparseFixedBitSet extends BitSet implements Bits, Accountable {
  }

  /**
-  * Set the bit at index <tt>i</tt>.
+  * Set the bit at index <code>i</code>.
   */
  public void set(int i) {
    assert consistent(i);

@@ -189,7 +189,7 @@ public class SparseFixedBitSet extends BitSet implements Bits, Accountable {
  }

  /**
-  * Clear the bit at index <tt>i</tt>.
+  * Clear the bit at index <code>i</code>.
   */
  public void clear(int i) {
    assert consistent(i);

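For context, a minimal sketch of the set/clear pair documented above (the length and index are illustrative):

import org.apache.lucene.util.SparseFixedBitSet;

public class BitSetSketch {
  public static void main(String[] args) {
    SparseFixedBitSet bits = new SparseFixedBitSet(1 << 16); // length in bits
    bits.set(4242);
    System.out.println(bits.get(4242)); // true
    bits.clear(4242);
    System.out.println(bits.get(4242)); // false
  }
}
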
@@ -26,7 +26,7 @@ import java.util.Arrays;
  * <p>This implementation is especially good at sorting partially-sorted
  * arrays and sorts small arrays with binary sort.
  * <p><b>NOTE</b>:There are a few differences with the original implementation:<ul>
- * <li><a name="maxTempSlots"></a>The extra amount of memory to perform merges is
+ * <li><a id="maxTempSlots"></a>The extra amount of memory to perform merges is
  * configurable. This allows small merges to be very fast while large merges
  * will be performed in-place (slightly slower). You can make sure that the
  * fast merge routine will always be used by having <code>maxTempSlots</code>

@@ -45,7 +45,7 @@ import java.util.concurrent.ConcurrentHashMap;
  * on the values and not-GCed keys. Lucene's implementation also supports {@code null}
  * keys, but those are never weak!
  *
- * <p><a name="reapInfo"></a>The map supports two modes of operation:
+ * <p><a id="reapInfo"></a>The map supports two modes of operation:
  * <ul>
  * <li>{@code reapOnRead = true}: This behaves identical to a {@link java.util.WeakHashMap}
  * where it also cleans up the reference queue on every read operation ({@link #get(Object)},

@@ -40,7 +40,8 @@ import java.util.Set;
  * Regular Expression extension to <code>Automaton</code>.
  * <p>
  * Regular expressions are built from the following abstract syntax:
- * <table border=0 summary="description of regular expression grammar">
+ * <table style="border: 0">
+ * <caption>description of regular expression grammar</caption>
  * <tr>
  * <td><i>regexp</i></td>
  * <td>::=</td>

@@ -59,7 +60,7 @@ import java.util.Set;
  * <tr>
  * <td><i>unionexp</i></td>
  * <td>::=</td>
- * <td><i>interexp</i> <tt><b>|</b></tt> <i>unionexp</i></td>
+ * <td><i>interexp</i> <code><b>|</b></code> <i>unionexp</i></td>
  * <td>(union)</td>
  * <td></td>
  * </tr>

@@ -74,7 +75,7 @@ import java.util.Set;
  * <tr>
  * <td><i>interexp</i></td>
  * <td>::=</td>
- * <td><i>concatexp</i> <tt><b>&</b></tt> <i>interexp</i></td>
+ * <td><i>concatexp</i> <code><b>&</b></code> <i>interexp</i></td>
  * <td>(intersection)</td>
  * <td><small>[OPTIONAL]</small></td>
  * </tr>

@@ -104,43 +105,43 @@ import java.util.Set;
  * <tr>
  * <td><i>repeatexp</i></td>
  * <td>::=</td>
- * <td><i>repeatexp</i> <tt><b>?</b></tt></td>
+ * <td><i>repeatexp</i> <code><b>?</b></code></td>
  * <td>(zero or one occurrence)</td>
  * <td></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><i>repeatexp</i> <tt><b>*</b></tt></td>
+ * <td><i>repeatexp</i> <code><b>*</b></code></td>
  * <td>(zero or more occurrences)</td>
  * <td></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><i>repeatexp</i> <tt><b>+</b></tt></td>
+ * <td><i>repeatexp</i> <code><b>+</b></code></td>
  * <td>(one or more occurrences)</td>
  * <td></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><i>repeatexp</i> <tt><b>{</b><i>n</i><b>}</b></tt></td>
- * <td>(<tt><i>n</i></tt> occurrences)</td>
+ * <td><i>repeatexp</i> <code><b>{</b><i>n</i><b>}</b></code></td>
+ * <td>(<code><i>n</i></code> occurrences)</td>
  * <td></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><i>repeatexp</i> <tt><b>{</b><i>n</i><b>,}</b></tt></td>
- * <td>(<tt><i>n</i></tt> or more occurrences)</td>
+ * <td><i>repeatexp</i> <code><b>{</b><i>n</i><b>,}</b></code></td>
+ * <td>(<code><i>n</i></code> or more occurrences)</td>
  * <td></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><i>repeatexp</i> <tt><b>{</b><i>n</i><b>,</b><i>m</i><b>}</b></tt></td>
- * <td>(<tt><i>n</i></tt> to <tt><i>m</i></tt> occurrences, including both)</td>
+ * <td><i>repeatexp</i> <code><b>{</b><i>n</i><b>,</b><i>m</i><b>}</b></code></td>
+ * <td>(<code><i>n</i></code> to <code><i>m</i></code> occurrences, including both)</td>
  * <td></td>
  * </tr>
  * <tr>

@@ -154,7 +155,7 @@ import java.util.Set;
  * <tr>
  * <td><i>complexp</i></td>
  * <td>::=</td>
- * <td><tt><b>~</b></tt> <i>complexp</i></td>
+ * <td><code><b>~</b></code> <i>complexp</i></td>
  * <td>(complement)</td>
  * <td><small>[OPTIONAL]</small></td>
  * </tr>

@@ -169,14 +170,14 @@ import java.util.Set;
  * <tr>
  * <td><i>charclassexp</i></td>
  * <td>::=</td>
- * <td><tt><b>[</b></tt> <i>charclasses</i> <tt><b>]</b></tt></td>
+ * <td><code><b>[</b></code> <i>charclasses</i> <code><b>]</b></code></td>
  * <td>(character class)</td>
  * <td></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><tt><b>[^</b></tt> <i>charclasses</i> <tt><b>]</b></tt></td>
+ * <td><code><b>[^</b></code> <i>charclasses</i> <code><b>]</b></code></td>
  * <td>(negated character class)</td>
  * <td></td>
  * </tr>

@@ -206,7 +207,7 @@ import java.util.Set;
  * <tr>
  * <td><i>charclass</i></td>
  * <td>::=</td>
- * <td><i>charexp</i> <tt><b>-</b></tt> <i>charexp</i></td>
+ * <td><i>charexp</i> <code><b>-</b></code> <i>charexp</i></td>
  * <td>(character range, including end-points)</td>
  * <td></td>
  * </tr>

@@ -228,56 +229,56 @@ import java.util.Set;
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><tt><b>.</b></tt></td>
+ * <td><code><b>.</b></code></td>
  * <td>(any single character)</td>
  * <td></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><tt><b>#</b></tt></td>
+ * <td><code><b>#</b></code></td>
  * <td>(the empty language)</td>
  * <td><small>[OPTIONAL]</small></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><tt><b>@</b></tt></td>
+ * <td><code><b>@</b></code></td>
  * <td>(any string)</td>
  * <td><small>[OPTIONAL]</small></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><tt><b>"</b></tt> <Unicode string without double-quotes> <tt><b>"</b></tt></td>
+ * <td><code><b>"</b></code> <Unicode string without double-quotes> <code><b>"</b></code></td>
  * <td>(a string)</td>
  * <td></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><tt><b>(</b></tt> <tt><b>)</b></tt></td>
+ * <td><code><b>(</b></code> <code><b>)</b></code></td>
  * <td>(the empty string)</td>
  * <td></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><tt><b>(</b></tt> <i>unionexp</i> <tt><b>)</b></tt></td>
+ * <td><code><b>(</b></code> <i>unionexp</i> <code><b>)</b></code></td>
  * <td>(precedence override)</td>
  * <td></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><tt><b><</b></tt> <identifier> <tt><b>></b></tt></td>
+ * <td><code><b><</b></code> <identifier> <code><b>></b></code></td>
  * <td>(named automaton)</td>
  * <td><small>[OPTIONAL]</small></td>
  * </tr>
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><tt><b><</b><i>n</i>-<i>m</i><b>></b></tt></td>
+ * <td><code><b><</b><i>n</i>-<i>m</i><b>></b></code></td>
  * <td>(numerical interval)</td>
  * <td><small>[OPTIONAL]</small></td>
  * </tr>

@@ -292,7 +293,7 @@ import java.util.Set;
  * <tr>
  * <td></td>
  * <td>|</td>
- * <td><tt><b>\</b></tt> <Unicode character> </td>
+ * <td><code><b>\</b></code> <Unicode character> </td>
  * <td>(a single character)</td>
  * <td></td>
  * </tr>

@@ -301,13 +302,13 @@ import java.util.Set;
  * The productions marked <small>[OPTIONAL]</small> are only allowed if
  * specified by the syntax flags passed to the <code>RegExp</code> constructor.
  * The reserved characters used in the (enabled) syntax must be escaped with
- * backslash (<tt><b>\</b></tt>) or double-quotes (<tt><b>"..."</b></tt>). (In
+ * backslash (<code><b>\</b></code>) or double-quotes (<code><b>"..."</b></code>). (In
  * contrast to other regexp syntaxes, this is required also in character
- * classes.) Be aware that dash (<tt><b>-</b></tt>) has a special meaning in
+ * classes.) Be aware that dash (<code><b>-</b></code>) has a special meaning in
  * <i>charclass</i> expressions. An identifier is a string not containing right
- * angle bracket (<tt><b>></b></tt>) or dash (<tt><b>-</b></tt>). Numerical
+ * angle bracket (<code><b>></b></code>) or dash (<code><b>-</b></code>). Numerical
  * intervals are specified by non-negative decimal integers and include both end
- * points, and if <tt><i>n</i></tt> and <tt><i>m</i></tt> have the same number
+ * points, and if <code><i>n</i></code> and <code><i>m</i></code> have the same number
  * of digits, then the conforming strings must have that length (i.e. prefixed
  * by 0's).
  *

@@ -320,33 +321,33 @@ public class RegExp {
  }

  /**
-  * Syntax flag, enables intersection (<tt>&</tt>).
+  * Syntax flag, enables intersection (<code>&</code>).
   */
  public static final int INTERSECTION = 0x0001;

  /**
-  * Syntax flag, enables complement (<tt>~</tt>).
+  * Syntax flag, enables complement (<code>~</code>).
   */
  public static final int COMPLEMENT = 0x0002;

  /**
-  * Syntax flag, enables empty language (<tt>#</tt>).
+  * Syntax flag, enables empty language (<code>#</code>).
   */
  public static final int EMPTY = 0x0004;

  /**
-  * Syntax flag, enables anystring (<tt>@</tt>).
+  * Syntax flag, enables anystring (<code>@</code>).
   */
  public static final int ANYSTRING = 0x0008;

  /**
-  * Syntax flag, enables named automata (<tt><</tt>identifier<tt>></tt>).
+  * Syntax flag, enables named automata (<code><</code>identifier<code>></code>).
   */
  public static final int AUTOMATON = 0x0010;

  /**
   * Syntax flag, enables numerical intervals (
-  * <tt><<i>n</i>-<i>m</i>></tt>).
+  * <code><<i>n</i>-<i>m</i>></code>).
   */
  public static final int INTERVAL = 0x0020;

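For context, a minimal sketch showing how the optional syntax is switched on via these flags (the pattern is illustrative):

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;

public class RegExpSketch {
  public static void main(String[] args) {
    // Intersection (&) and numerical intervals (<n-m>) must be enabled explicitly:
    RegExp re = new RegExp("[0-9]+&<1-100>", RegExp.INTERSECTION | RegExp.INTERVAL);
    Automaton a = re.toAutomaton(); // matches the decimal strings for 1..100
  }
}
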
@@ -61,7 +61,7 @@ public class StatePair {
   * Checks for equality.
   *
   * @param obj object to compare with
-  * @return true if <tt>obj</tt> represents the same pair of states as this
+  * @return true if <code>obj</code> represents the same pair of states as this
   *         pair
   */
  @Override

@@ -35,12 +35,12 @@
  * regular expression operations.
  * <p>
  * The most commonly used functionality is located in the classes
- * <tt>{@link org.apache.lucene.util.automaton.Automaton}</tt> and
- * <tt>{@link org.apache.lucene.util.automaton.RegExp}</tt>.
+ * <code>{@link org.apache.lucene.util.automaton.Automaton}</code> and
+ * <code>{@link org.apache.lucene.util.automaton.RegExp}</code>.
  * <p>
  * For more information, go to the package home page at
- * <tt><a href="http://www.brics.dk/automaton/"
- * target="_top">http://www.brics.dk/automaton/</a></tt>.
+ * <code><a href="http://www.brics.dk/automaton/"
+ * target="_top">http://www.brics.dk/automaton/</a></code>.
  * @lucene.experimental
  */
 package org.apache.lucene.util.automaton;

@@ -50,7 +50,7 @@ abstract class AbstractBlockPackedWriter {

  /**
   * Sole constructor.
-  * @param blockSize the number of values of a single block, must be a multiple of <tt>64</tt>
+  * @param blockSize the number of values of a single block, must be a multiple of <code>64</code>
   */
  public AbstractBlockPackedWriter(DataOutput out, int blockSize) {
    checkBlockSize(blockSize, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);

@@ -177,7 +177,7 @@ public final class BlockPackedReaderIterator {
    return value;
  }

- /** Read between <tt>1</tt> and <code>count</code> values. */
+ /** Read between <code>1</code> and <code>count</code> values. */
  public LongsRef next(int count) throws IOException {
    assert count > 0;
    if (ord == valueCount) {

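For context, a minimal round-trip sketch for this writer/reader pair (the buffer type, block size and value count are illustrative assumptions):

import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.packed.BlockPackedReaderIterator;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;

public class BlockPackedSketch {
  public static void main(String[] args) throws Exception {
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    BlockPackedWriter writer = new BlockPackedWriter(out, 64); // blockSize: a multiple of 64
    for (long i = 0; i < 128; i++) {
      writer.add(i);
    }
    writer.finish();

    BlockPackedReaderIterator reader =
        new BlockPackedReaderIterator(out.toDataInput(), PackedInts.VERSION_CURRENT, 64, 128);
    LongsRef values = reader.next(128); // reads between 1 and count values per call
  }
}
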
@@ -39,17 +39,17 @@ import org.apache.lucene.store.DataOutput;
  * <li>Block: <Header, (Ints)>
  * <li>Header: <Token, (MinValue)>
  * <li>Token: a {@link DataOutput#writeByte(byte) byte}, first 7 bits are the
- *     number of bits per value (<tt>bitsPerValue</tt>). If the 8th bit is 1,
- *     then MinValue (see next) is <tt>0</tt>, otherwise MinValue and needs to
+ *     number of bits per value (<code>bitsPerValue</code>). If the 8th bit is 1,
+ *     then MinValue (see next) is <code>0</code>, otherwise MinValue and needs to
  *     be decoded
  * <li>MinValue: a
  *     <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">zigzag-encoded</a>
  *     {@link DataOutput#writeVLong(long) variable-length long} whose value
  *     should be added to every int from the block to restore the original
  *     values
- * <li>Ints: If the number of bits per value is <tt>0</tt>, then there is
+ * <li>Ints: If the number of bits per value is <code>0</code>, then there is
  *     nothing to decode and all ints are equal to MinValue. Otherwise: BlockSize
- *     {@link PackedInts packed ints} encoded on exactly <tt>bitsPerValue</tt>
+ *     {@link PackedInts packed ints} encoded on exactly <code>bitsPerValue</code>
  *     bits per value. They are the subtraction of the original values and
  *     MinValue
  * </ul>

@@ -45,7 +45,7 @@ import org.apache.lucene.util.BitUtil;
  *     {@link Float#floatToIntBits(float)} on
  *     {@link DataOutput#writeInt(int) 4 bytes}
  * <li>BitsPerValue: a {@link DataOutput#writeVInt(int) variable-length int}
- * <li>Ints: if BitsPerValue is <tt>0</tt>, then there is nothing to read and
+ * <li>Ints: if BitsPerValue is <code>0</code>, then there is nothing to read and
  *     all values perfectly match the result of the function. Otherwise, these
  *     are the {@link PackedInts packed} deltas from the expected value
  *     (computed from the function) using exactly BitsPerValue bits per value.

@@ -147,22 +147,22 @@ index for all the files contained in a directory.</li>
 queries and searches an index.</li>
 </ul>
 To demonstrate these, try something like:
-<blockquote><tt>> <b>java -cp lucene-core.jar:lucene-demo.jar:lucene-analyzers-common.jar org.apache.lucene.demo.IndexFiles -index index -docs rec.food.recipes/soups</b></tt>
-<br><tt>adding rec.food.recipes/soups/abalone-chowder</tt>
-<br><tt>  </tt>[ ... ]
+<blockquote><code>> <b>java -cp lucene-core.jar:lucene-demo.jar:lucene-analyzers-common.jar org.apache.lucene.demo.IndexFiles -index index -docs rec.food.recipes/soups</b></code>
+<br><code>adding rec.food.recipes/soups/abalone-chowder</code>
+<br><code>  </code>[ ... ]

-<p><tt>> <b>java -cp lucene-core.jar:lucene-demo.jar:lucene-queryparser.jar:lucene-analyzers-common.jar org.apache.lucene.demo.SearchFiles</b></tt>
-<br><tt>Query: <b>chowder</b></tt>
-<br><tt>Searching for: chowder</tt>
-<br><tt>34 total matching documents</tt>
-<br><tt>1. rec.food.recipes/soups/spam-chowder</tt>
-<br><tt>  </tt>[ ... thirty-four documents contain the word "chowder" ... ]
+<p><code>> <b>java -cp lucene-core.jar:lucene-demo.jar:lucene-queryparser.jar:lucene-analyzers-common.jar org.apache.lucene.demo.SearchFiles</b></code>
+<br><code>Query: <b>chowder</b></code>
+<br><code>Searching for: chowder</code>
+<br><code>34 total matching documents</code>
+<br><code>1. rec.food.recipes/soups/spam-chowder</code>
+<br><code>  </code>[ ... thirty-four documents contain the word "chowder" ... ]

-<p><tt>Query: <b>"clam chowder" AND Manhattan</b></tt>
-<br><tt>Searching for: +"clam chowder" +manhattan</tt>
-<br><tt>2 total matching documents</tt>
-<br><tt>1. rec.food.recipes/soups/clam-chowder</tt>
-<br><tt>  </tt>[ ... two documents contain the phrase "clam chowder"
+<p><code>Query: <b>"clam chowder" AND Manhattan</b></code>
+<br><code>Searching for: +"clam chowder" +manhattan</code>
+<br><code>2 total matching documents</code>
+<br><code>1. rec.food.recipes/soups/clam-chowder</code>
+<br><code>  </code>[ ... two documents contain the phrase "clam chowder"
 and the word "manhattan" ... ]
 <br> [ Note: "+" and "-" are canonical, but "AND", "OR"
 and "NOT" may be used. ]</blockquote>

@@ -34,21 +34,21 @@
 <li><a href="#Searching_Files">Searching Files</a></li>
 </ul>
 </div>
-<a name="About_this_Document"></a>
+<a id="About_this_Document"></a>
 <h2 class="boxed">About this Document</h2>
 <div class="section">
 <p>This document is intended as a "getting started" guide to using and running
 the Lucene demos. It walks you through some basic installation and
 configuration.</p>
 </div>
-<a name="About_the_Demo"></a>
+<a id="About_the_Demo"></a>
 <h2 class="boxed">About the Demo</h2>
 <div class="section">
 <p>The Lucene command-line demo code consists of an application that
 demonstrates various functionalities of Lucene and how you can add Lucene to
 your applications.</p>
 </div>
-<a name="Setting_your_CLASSPATH"></a>
+<a id="Setting_your_CLASSPATH"></a>
 <h2 class="boxed">Setting your CLASSPATH</h2>
 <div class="section">
 <p>First, you should <a href=

@@ -65,7 +65,7 @@ files called <span class="codefrag">lucene-queryparser-{version}.jar</span>,
 respectively.</p>
 <p>Put all four of these files in your Java CLASSPATH.</p>
 </div>
-<a name="Indexing_Files"></a>
+<a id="Indexing_Files"></a>
 <h2 class="boxed">Indexing Files</h2>
 <div class="section">
 <p>Once you've gotten this far you're probably itching to go. Let's <b>build an

@@ -85,7 +85,7 @@ You'll see that there are no maching results in the lucene source code.
 Now try entering the word "string". That should return a whole bunch
 of documents. The results will page at every tenth result and ask you whether
 you want more results.</div>
-<a name="About_the_code"></a>
+<a id="About_the_code"></a>
 <h2 class="boxed">About the code</h2>
 <div class="section">
 <p>In this section we walk through the sources behind the command-line Lucene

@@ -93,7 +93,7 @@ demo: where to find them, their parts and their function. This section is
 intended for Java developers wishing to understand how to use Lucene in their
 applications.</p>
 </div>
-<a name="Location_of_the_source"></a>
+<a id="Location_of_the_source"></a>
 <h2 class="boxed">Location of the source</h2>
 <div class="section">
 <p>The files discussed here are linked into this documentation directly:

@@ -101,9 +101,8 @@ applications.</p>
 <li><a href="src-html/org/apache/lucene/demo/IndexFiles.html">IndexFiles.java</a>: code to create a Lucene index.
 <li><a href="src-html/org/apache/lucene/demo/SearchFiles.html">SearchFiles.java</a>: code to search a Lucene index.
 </ul>
-</p>
 </div>
-<a name="IndexFiles" id="IndexFiles"></a>
+<a id="IndexFiles"></a>
 <h2 class="boxed">IndexFiles</h2>
 <div class="section">
 <p>As we discussed in the previous walk-through, the <a href=

@@ -174,7 +173,7 @@ already-indexed document with the same identifier (in our case, the file path
 serves as the identifier); deleting it from the index if it exists; and then
 adding the new document to the index.</p>
 </div>
-<a name="Searching_Files"></a>
+<a id="Searching_Files"></a>
 <h2 class="boxed">Searching Files</h2>
 <div class="section">
 <p>The <a href=

@@ -31,11 +31,9 @@
 <li>{@link org.apache.lucene.expressions} - The abstractions and simple utilities for common operations like sorting on an expression</li>
 <li>{@link org.apache.lucene.expressions.js} - A compiler for a subset of JavaScript expressions</li>
 </ol>
-</p>
 <p>
 For sample code showing how to use the API, see {@link org.apache.lucene.expressions.Expression}.
 </p>
-<p>

 </body>
-</html>
+</html>

@@ -35,7 +35,8 @@
  * <p>To explain the algorithm, let's use the following sample text
  * (to be highlighted) and user query:</p>
  *
- * <table border=1 summary="sample document and query">
+ * <table style="border: 1px solid">
+ * <caption>sample document and query</caption>
  * <tr>
  *   <td><b>Sample Text</b></td>
  *   <td>Lucene is a search engine library.</td>

@@ -164,7 +165,7 @@
  * <li><code>WeightedFragListBuilder using WeightedFieldFragList</code>: <i>sum-of-distinct-weights</i>-approach. The totalBoost is calculated by summarizing the IDF-weights of distinct terms.</li>
  * </ul>
  * <p>Comparison of the two approaches:</p>
- * <table border="1">
+ * <table style="border: 1px solid">
  * <caption>
  * query = das alte testament (The Old Testament)
  * </caption>

@@ -336,7 +336,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
  }

  /**
-  * This method intended for use with <tt>testHighlightingWithDefaultField()</tt>
+  * This method intended for use with <code>testHighlightingWithDefaultField()</code>
   */
  private String highlightField(Query query, String fieldName, String text)
      throws IOException, InvalidTokenOffsetsException {

@@ -101,7 +101,7 @@ public class HighlightCustomQueryTest extends LuceneTestCase {

  /**
   * This method intended for use with
-  * <tt>testHighlightingWithDefaultField()</tt>
+  * <code>testHighlightingWithDefaultField()</code>
   */
  private String highlightField(Query query, String fieldName,
      String text) throws IOException, InvalidTokenOffsetsException {

@@ -27,7 +27,7 @@
 The misc package has various tools for splitting/merging indices,
 changing norms, finding high freq terms, and others.

-<a name="NativeUnixDirectory"></a>
+<a id="NativeUnixDirectory"></a>
 <h2>NativeUnixDirectory</h2>

 <p>

@@ -47,15 +47,15 @@ for details.

 Steps to build:
 <ul>
-  <li> <tt>cd lucene/misc/</tt>
+  <li> <code>cd lucene/misc/</code>

-  <li> To compile NativePosixUtil.cpp -> libNativePosixUtil.so, run<tt> ant build-native-unix</tt>.
+  <li> To compile NativePosixUtil.cpp -> libNativePosixUtil.so, run<code> ant build-native-unix</code>.

-  <li><tt>libNativePosixUtil.so</tt> will be located in the <tt>lucene/build/native/</tt> folder
+  <li><code>libNativePosixUtil.so</code> will be located in the <code>lucene/build/native/</code> folder

-  <li> Make sure libNativePosixUtil.so is on your LD_LIBRARY_PATH so java can find it (something like <tt>export LD_LIBRARY_PATH=/path/to/dir:$LD_LIBRARY_PATH</tt>, where /path/to/dir contains libNativePosixUtil.so)
+  <li> Make sure libNativePosixUtil.so is on your LD_LIBRARY_PATH so java can find it (something like <code>export LD_LIBRARY_PATH=/path/to/dir:$LD_LIBRARY_PATH</code>, where /path/to/dir contains libNativePosixUtil.so)

-  <li> <tt>ant jar</tt> to compile the java source and put that JAR on your CLASSPATH
+  <li> <code>ant jar</code> to compile the java source and put that JAR on your CLASSPATH
 </ul>

 <p>

@@ -256,7 +256,7 @@ public class CommonTermsQuery extends Query {
   * satisfied in order to produce a match on the low frequency terms query
   * part. This method accepts a float value in the range [0..1) as a fraction
   * of the actual query terms in the low frequent clause or a number
-  * <tt>>=1</tt> as an absolut number of clauses that need to match.
+  * <code>>=1</code> as an absolut number of clauses that need to match.
   *
   * <p>
   * By default no optional clauses are necessary for a match (unless there are

@@ -284,7 +284,7 @@ public class CommonTermsQuery extends Query {
   * satisfied in order to produce a match on the low frequency terms query
   * part. This method accepts a float value in the range [0..1) as a fraction
   * of the actual query terms in the low frequent clause or a number
-  * <tt>>=1</tt> as an absolut number of clauses that need to match.
+  * <code>>=1</code> as an absolut number of clauses that need to match.
   *
   * <p>
   * By default no optional clauses are necessary for a match (unless there are

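For context, a minimal sketch of the two accepted argument ranges for this setter (the field and terms are illustrative):

import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause.Occur;

public class CommonTermsSketch {
  public static void main(String[] args) {
    // Terms that occur in more than 10% of documents count as high-frequency:
    CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, 0.1f);
    query.add(new Term("body", "the"));
    query.add(new Term("body", "chowder"));
    query.setLowFreqMinimumNumberShouldMatch(0.5f); // a fraction in [0..1)
    // or an absolute clause count:
    // query.setLowFreqMinimumNumberShouldMatch(2f);
  }
}
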
Some files were not shown because too many files have changed in this diff