mirror of https://github.com/apache/lucene.git
LUCENE-4797: enable doclint html verification
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1658040 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8f3a71443b
commit
376256316b
|
@ -37,7 +37,6 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
|||
* This analyzer implements light-stemming as specified by: <i> Searching
|
||||
* Strategies for the Bulgarian Language </i>
|
||||
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
|
||||
* <p>
|
||||
*/
|
||||
public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
||||
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
|
||||
/**
|
||||
* Normalization of text before the tokenizer.
|
||||
* </p>
|
||||
* <p>
|
||||
* CharFilters are chainable filters that normalize text before tokenization
|
||||
* and provide mappings between normalized text offsets and the corresponding
|
||||
|
|
|
@ -38,7 +38,7 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
|
|||
|
||||
/**
|
||||
* File containing default CJK stopwords.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Currently it contains some common English words that are not usually
|
||||
* useful for searching and some double-byte interpunctions.
|
||||
*/
|
||||
|
|
|
@ -32,6 +32,5 @@
|
|||
* <li>CJKAnalyzer: 我是-是中-中国-国人</li>
|
||||
* <li>SmartChineseAnalyzer: 我-是-中国-人</li>
|
||||
* </ol>
|
||||
* </p>
|
||||
*/
|
||||
package org.apache.lucene.analysis.cjk;
|
||||
|
|
|
@ -27,7 +27,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
|
|||
* "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
|
||||
* "Donaudampfschiff" even when you only enter "schiff".
|
||||
* It uses a brute-force algorithm to achieve this.
|
||||
* <p>
|
||||
*/
|
||||
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
|
||||
|
||||
|
|
|
@ -48,7 +48,7 @@ import org.xml.sax.InputSource;
|
|||
* <li><code>onlyLongestMatch</code> (optional): if true, adds only the longest matching subword
|
||||
* to the stream. defaults to false.
|
||||
* </ul>
|
||||
* <p>
|
||||
* <br>
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_hyphncomp" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
|
|
|
@ -92,7 +92,7 @@
|
|||
* ). The files you need are in the subfolder
|
||||
* <i>offo-hyphenation/hyph/</i>
|
||||
* .
|
||||
* <br />
|
||||
* <br>
|
||||
* Credits for the hyphenation code go to the
|
||||
* <a href="http://xmlgraphics.apache.org/fop/">Apache FOP project</a>
|
||||
* .
|
||||
|
|
|
@ -29,14 +29,12 @@ import org.apache.lucene.analysis.util.StemmerUtil;
|
|||
* of the <a href="http://snowball.tartarus.org/algorithms/german2/stemmer.html">
|
||||
* German2 snowball algorithm</a>.
|
||||
* It allows for the fact that ä, ö and ü are sometimes written as ae, oe and ue.
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li> 'ß' is replaced by 'ss'
|
||||
* <li> 'ä', 'ö', 'ü' are replaced by 'a', 'o', 'u', respectively.
|
||||
* <li> 'ae' and 'oe' are replaced by 'a', and 'o', respectively.
|
||||
* <li> 'ue' is replaced by 'u', when not following a vowel or q.
|
||||
* </ul>
|
||||
* <p>
|
||||
* This is useful if you want this normalization without using
|
||||
* the German2 stemmer, or perhaps no stemming at all.
|
||||
*/
|
||||
|
|
|
@ -25,12 +25,12 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
/** A high-performance kstem filter for english.
|
||||
* <p/>
|
||||
* <p>
|
||||
* See <a href="http://ciir.cs.umass.edu/pubfiles/ir-35.pdf">
|
||||
* "Viewing Morphology as an Inference Process"</a>
|
||||
* (Krovetz, R., Proceedings of the Sixteenth Annual International ACM SIGIR
|
||||
* Conference on Research and Development in Information Retrieval, 191-203, 1993).
|
||||
* <p/>
|
||||
* <p>
|
||||
* All terms must already be lowercased for this filter to work correctly.
|
||||
*
|
||||
* <p>
|
||||
|
|
|
@ -33,7 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
Analyzer class that sets up the TokenStream chain as you want it.
|
||||
To use this with LowerCaseTokenizer, for example, you'd write an
|
||||
analyzer like this:
|
||||
<P>
|
||||
<br>
|
||||
<PRE class="prettyprint">
|
||||
class MyAnalyzer extends Analyzer {
|
||||
{@literal @Override}
|
||||
|
|
|
@ -28,7 +28,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
|
|||
/**
|
||||
* A filter to apply normal capitalization rules to Tokens. It will make the first letter
|
||||
* capital and the rest lower case.
|
||||
* <p/>
|
||||
* <p>
|
||||
* This filter is particularly useful to build nice looking facet parameters. This filter
|
||||
* is not appropriate if you intend to use a prefix query.
|
||||
*/
|
||||
|
|
|
@ -29,19 +29,21 @@ import java.util.Set;
|
|||
|
||||
/**
|
||||
* Factory for {@link CapitalizationFilter}.
|
||||
* <p/>
|
||||
* The factory takes parameters:<br/>
|
||||
* "onlyFirstWord" - should each word be capitalized or all of the words?<br/>
|
||||
* "keep" - a keep word list. Each word that should be kept separated by whitespace.<br/>
|
||||
* "keepIgnoreCase - true or false. If true, the keep list will be considered case-insensitive.<br/>
|
||||
* "forceFirstLetter" - Force the first letter to be capitalized even if it is in the keep list<br/>
|
||||
* "okPrefix" - do not change word capitalization if a word begins with something in this list.
|
||||
* <p>
|
||||
* The factory takes parameters:
|
||||
* <ul>
|
||||
* <li> "onlyFirstWord" - should each word be capitalized or all of the words?
|
||||
* <li> "keep" - a keep word list. Each word that should be kept separated by whitespace.
|
||||
* <li> "keepIgnoreCase - true or false. If true, the keep list will be considered case-insensitive.
|
||||
* <li> "forceFirstLetter" - Force the first letter to be capitalized even if it is in the keep list
|
||||
* <li> "okPrefix" - do not change word capitalization if a word begins with something in this list.
|
||||
* for example if "McK" is on the okPrefix list, the word "McKinley" should not be changed to
|
||||
* "Mckinley"<br/>
|
||||
* "minWordLength" - how long the word needs to be to get capitalization applied. If the
|
||||
* minWordLength is 3, "and" > "And" but "or" stays "or"<br/>
|
||||
* "maxWordCount" - if the token contains more then maxWordCount words, the capitalization is
|
||||
* assumed to be correct.<br/>
|
||||
* "Mckinley"
|
||||
* <li> "minWordLength" - how long the word needs to be to get capitalization applied. If the
|
||||
* minWordLength is 3, "and" > "And" but "or" stays "or"
|
||||
* <li>"maxWordCount" - if the token contains more then maxWordCount words, the capitalization is
|
||||
* assumed to be correct.
|
||||
* </ul>
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_cptlztn" class="solr.TextField" positionIncrementGap="100">
|
||||
|
|
|
@ -24,7 +24,7 @@ import java.io.IOException;
|
|||
|
||||
/**
|
||||
* Links two {@link PrefixAwareTokenFilter}.
|
||||
* <p/>
|
||||
* <p>
|
||||
* <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
|
||||
* the ones located in org.apache.lucene.analysis.tokenattributes.
|
||||
*/
|
||||
|
|
|
@ -35,7 +35,7 @@ import java.io.IOException;
|
|||
* to be used when updating the token values in the second stream based on that token.
|
||||
*
|
||||
* The default implementation adds last prefix token end offset to the suffix token start and end offsets.
|
||||
* <p/>
|
||||
* <p>
|
||||
* <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
|
||||
* the ones located in org.apache.lucene.analysis.tokenattributes.
|
||||
*/
|
||||
|
|
|
@ -27,25 +27,24 @@ import java.io.IOException;
|
|||
/**
|
||||
* This filter folds Scandinavian characters åÅäæÄÆ->a and öÖøØ->o.
|
||||
* It also discriminate against use of double vowels aa, ae, ao, oe and oo, leaving just the first one.
|
||||
* <p/>
|
||||
* <p>
|
||||
* It's a semantically more destructive solution than {@link ScandinavianNormalizationFilter} but
|
||||
* can in addition help with matching raksmorgas as räksmörgås.
|
||||
* <p/>
|
||||
* <p>
|
||||
* blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej == blabarsyltetoj
|
||||
* räksmörgås == ræksmørgås == ræksmörgaos == raeksmoergaas == raksmorgas
|
||||
* <p/>
|
||||
* <p>
|
||||
* Background:
|
||||
* Swedish åäö are in fact the same letters as Norwegian and Danish åæø and thus interchangeable
|
||||
* when used between these languages. They are however folded differently when people type
|
||||
* them on a keyboard lacking these characters.
|
||||
* <p/>
|
||||
* <p>
|
||||
* In that situation almost all Swedish people use a, a, o instead of å, ä, ö.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Norwegians and Danes on the other hand usually type aa, ae and oe instead of å, æ and ø.
|
||||
* Some do however use a, a, o, oo, ao and sometimes permutations of everything above.
|
||||
* <p/>
|
||||
* <p>
|
||||
* This filter solves that mismatch problem, but might also cause new.
|
||||
* <p/>
|
||||
* @see ScandinavianNormalizationFilter
|
||||
*/
|
||||
public final class ScandinavianFoldingFilter extends TokenFilter {
|
||||
|
|
|
@ -27,14 +27,13 @@ import java.io.IOException;
|
|||
/**
|
||||
* This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ
|
||||
* and folded variants (aa, ao, ae, oe and oo) by transforming them to åÅæÆøØ.
|
||||
* <p/>
|
||||
* <p>
|
||||
* It's a semantically less destructive solution than {@link ScandinavianFoldingFilter},
|
||||
* most useful when a person with a Norwegian or Danish keyboard queries a Swedish index
|
||||
* and vice versa. This filter does <b>not</b> the common Swedish folds of å and ä to a nor ö to o.
|
||||
* <p/>
|
||||
* <p>
|
||||
* blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej but not blabarsyltetoj
|
||||
* räksmörgås == ræksmørgås == ræksmörgaos == raeksmoergaas but not raksmorgas
|
||||
* <p/>
|
||||
* @see ScandinavianFoldingFilter
|
||||
*/
|
||||
public final class ScandinavianNormalizationFilter extends TokenFilter {
|
||||
|
|
|
@ -96,42 +96,42 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
|
||||
/**
|
||||
* Causes parts of words to be generated:
|
||||
* <p/>
|
||||
* <p>
|
||||
* "PowerShot" => "Power" "Shot"
|
||||
*/
|
||||
public static final int GENERATE_WORD_PARTS = 1;
|
||||
|
||||
/**
|
||||
* Causes number subwords to be generated:
|
||||
* <p/>
|
||||
* <p>
|
||||
* "500-42" => "500" "42"
|
||||
*/
|
||||
public static final int GENERATE_NUMBER_PARTS = 2;
|
||||
|
||||
/**
|
||||
* Causes maximum runs of word parts to be catenated:
|
||||
* <p/>
|
||||
* <p>
|
||||
* "wi-fi" => "wifi"
|
||||
*/
|
||||
public static final int CATENATE_WORDS = 4;
|
||||
|
||||
/**
|
||||
* Causes maximum runs of word parts to be catenated:
|
||||
* <p/>
|
||||
* <p>
|
||||
* "wi-fi" => "wifi"
|
||||
*/
|
||||
public static final int CATENATE_NUMBERS = 8;
|
||||
|
||||
/**
|
||||
* Causes all subword parts to be catenated:
|
||||
* <p/>
|
||||
* <p>
|
||||
* "wi-fi-4000" => "wifi4000"
|
||||
*/
|
||||
public static final int CATENATE_ALL = 16;
|
||||
|
||||
/**
|
||||
* Causes original words are preserved and added to the subword list (Defaults to false)
|
||||
* <p/>
|
||||
* <p>
|
||||
* "500-42" => "500" "42" "500-42"
|
||||
*/
|
||||
public static final int PRESERVE_ORIGINAL = 32;
|
||||
|
@ -150,7 +150,7 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
|
||||
/**
|
||||
* Causes trailing "'s" to be removed for each subword
|
||||
* <p/>
|
||||
* <p>
|
||||
* "O'Neil's" => "O", "Neil"
|
||||
*/
|
||||
public static final int STEM_ENGLISH_POSSESSIVE = 256;
|
||||
|
|
|
@ -31,7 +31,7 @@ import org.apache.lucene.analysis.util.CharacterUtils;
|
|||
* Tokenizes the given token into n-grams of given size(s).
|
||||
* <p>
|
||||
* This {@link TokenFilter} create n-grams from the beginning edge of a input token.
|
||||
* <p><a name="match_version" />As of Lucene 4.4, this filter handles correctly
|
||||
* <p><a name="match_version"></a>As of Lucene 4.4, this filter handles correctly
|
||||
* supplementary characters.
|
||||
*/
|
||||
public final class EdgeNGramTokenFilter extends TokenFilter {
|
||||
|
|
|
@ -25,7 +25,7 @@ import org.apache.lucene.util.Version;
|
|||
* Tokenizes the input from an edge into n-grams of given size(s).
|
||||
* <p>
|
||||
* This {@link Tokenizer} create n-grams from the beginning edge of a input token.
|
||||
* <p><a name="match_version" />As of Lucene 4.4, this class supports
|
||||
* <p><a name="match_version"></a>As of Lucene 4.4, this class supports
|
||||
* {@link #isTokenChar(int) pre-tokenization} and correctly handles
|
||||
* supplementary characters.
|
||||
*/
|
||||
|
|
|
@ -39,7 +39,7 @@ import org.apache.lucene.util.AttributeFactory;
|
|||
* <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
|
||||
* <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
|
||||
* </table>
|
||||
* <a name="version"/>
|
||||
* <a name="version"></a>
|
||||
* <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
|
||||
* <li>tokenize in a streaming fashion to support streams which are larger
|
||||
* than 1024 chars (limit of the previous version),
|
||||
|
|
|
@ -58,9 +58,9 @@ import org.apache.lucene.util.CharsRefBuilder;
|
|||
* </p>
|
||||
* <p>
|
||||
* <code>
|
||||
* "([A-Z]{2,})", <br />
|
||||
* "(?<![A-Z])([A-Z][a-z]+)", <br />
|
||||
* "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)", <br />
|
||||
* "([A-Z]{2,})",
|
||||
* "(?<![A-Z])([A-Z][a-z]+)",
|
||||
* "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)",
|
||||
* "([0-9]+)"
|
||||
* </code>
|
||||
* </p>
|
||||
|
|
|
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.charfilter.BaseCharFilter;
|
|||
* The pattern match will be done in each "block" in char stream.
|
||||
*
|
||||
* <p>
|
||||
* ex1) source="aa bb aa bb", pattern="(aa)\\s+(bb)" replacement="$1#$2"<br/>
|
||||
* ex1) source="aa bb aa bb", pattern="(aa)\\s+(bb)" replacement="$1#$2"<br>
|
||||
* output="aa#bb aa#bb"
|
||||
* </p>
|
||||
*
|
||||
|
@ -39,9 +39,9 @@ import org.apache.lucene.analysis.charfilter.BaseCharFilter;
|
|||
* face a trouble.
|
||||
*
|
||||
* <p>
|
||||
* ex2) source="aa123bb", pattern="(aa)\\d+(bb)" replacement="$1 $2"<br/>
|
||||
* output="aa bb"<br/>
|
||||
* and you want to search bb and highlight it, you will get<br/>
|
||||
* ex2) source="aa123bb", pattern="(aa)\\d+(bb)" replacement="$1 $2"<br>
|
||||
* output="aa bb"<br>
|
||||
* and you want to search bb and highlight it, you will get<br>
|
||||
* highlight snippet="aa1<em>23bb</em>"
|
||||
* </p>
|
||||
*
|
||||
|
|
|
@ -30,7 +30,6 @@ import org.apache.lucene.util.AttributeFactory;
|
|||
/**
|
||||
* This tokenizer uses regex pattern matching to construct distinct tokens
|
||||
* for the input stream. It takes two arguments: "pattern" and "group".
|
||||
* <p/>
|
||||
* <ul>
|
||||
* <li>"pattern" is the regular expression.</li>
|
||||
* <li>"group" says which group to extract into tokens.</li>
|
||||
|
@ -41,7 +40,7 @@ import org.apache.lucene.util.AttributeFactory;
|
|||
* {@link String#split(java.lang.String)}
|
||||
* </p>
|
||||
* <p>
|
||||
* Using group >= 0 selects the matching group as the token. For example, if you have:<br/>
|
||||
* Using group >= 0 selects the matching group as the token. For example, if you have:<br>
|
||||
* <pre>
|
||||
* pattern = \'([^\']+)\'
|
||||
* group = 0
|
||||
|
@ -49,7 +48,6 @@ import org.apache.lucene.util.AttributeFactory;
|
|||
*</pre>
|
||||
* the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
|
||||
* but using group=1, the output would be: bbb and ccc (no ' marks)
|
||||
* </p>
|
||||
* <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
|
||||
*
|
||||
* @see Pattern
|
||||
|
|
|
@ -27,7 +27,7 @@ import org.apache.lucene.util.AttributeFactory;
|
|||
* Factory for {@link PatternTokenizer}.
|
||||
* This tokenizer uses regex pattern matching to construct distinct tokens
|
||||
* for the input stream. It takes two arguments: "pattern" and "group".
|
||||
* <p/>
|
||||
* <br>
|
||||
* <ul>
|
||||
* <li>"pattern" is the regular expression.</li>
|
||||
* <li>"group" says which group to extract into tokens.</li>
|
||||
|
@ -38,7 +38,7 @@ import org.apache.lucene.util.AttributeFactory;
|
|||
* {@link String#split(java.lang.String)}
|
||||
* </p>
|
||||
* <p>
|
||||
* Using group >= 0 selects the matching group as the token. For example, if you have:<br/>
|
||||
* Using group >= 0 selects the matching group as the token. For example, if you have:<br>
|
||||
* <pre>
|
||||
* pattern = \'([^\']+)\'
|
||||
* group = 0
|
||||
|
@ -46,7 +46,6 @@ import org.apache.lucene.util.AttributeFactory;
|
|||
* </pre>
|
||||
* the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
|
||||
* but using group=1, the output would be: bbb and ccc (no ' marks)
|
||||
* </p>
|
||||
* <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
|
|
|
@ -26,12 +26,12 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
|
||||
/**
|
||||
* Characters before the delimiter are the "token", those after are the payload.
|
||||
* <p/>
|
||||
* <p>
|
||||
* For example, if the delimiter is '|', then for the string "foo|bar", foo is the token
|
||||
* and "bar" is a payload.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Note, you can also include a {@link org.apache.lucene.analysis.payloads.PayloadEncoder} to convert the payload in an appropriate way (from characters to bytes).
|
||||
* <p/>
|
||||
* <p>
|
||||
* Note make sure your Tokenizer doesn't split on the delimiter, or this won't work
|
||||
*
|
||||
* @see PayloadEncoder
|
||||
|
|
|
@ -20,8 +20,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
*/
|
||||
|
||||
/**
|
||||
* Encode a character array Float as a {@link BytesRef}.
|
||||
* <p/>
|
||||
* Encode a character array Float as a {@link BytesRef}.
|
||||
* @see org.apache.lucene.analysis.payloads.PayloadHelper#encodeFloat(float, byte[], int)
|
||||
*
|
||||
**/
|
||||
|
|
|
@ -22,7 +22,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
|
||||
/**
|
||||
* Encode a character array Integer as a {@link BytesRef}.
|
||||
* <p/>
|
||||
* <p>
|
||||
* See {@link org.apache.lucene.analysis.payloads.PayloadHelper#encodeInt(int, byte[], int)}.
|
||||
*
|
||||
**/
|
||||
|
|
|
@ -23,7 +23,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
/**
|
||||
* Mainly for use with the DelimitedPayloadTokenFilter, converts char buffers to
|
||||
* {@link BytesRef}.
|
||||
* <p/>
|
||||
* <p>
|
||||
* NOTE: This interface is subject to change
|
||||
*
|
||||
**/
|
||||
|
|
|
@ -28,7 +28,6 @@ import org.apache.lucene.util.AttributeSource;
|
|||
/**
|
||||
* Attempts to parse the {@link CharTermAttribute#buffer()} as a Date using a {@link java.text.DateFormat}.
|
||||
* If the value is a Date, it will add it to the sink.
|
||||
* <p/>
|
||||
*
|
||||
**/
|
||||
public class DateRecognizerSinkFilter extends TeeSinkTokenFilter.SinkFilter {
|
||||
|
|
|
@ -32,7 +32,7 @@ import org.apache.lucene.util.AttributeSource;
|
|||
* This TokenFilter provides the ability to set aside attribute states
|
||||
* that have already been analyzed. This is useful in situations where multiple fields share
|
||||
* many common analysis steps and then go their separate ways.
|
||||
* <p/>
|
||||
* <p>
|
||||
* It is also useful for doing things like entity extraction or proper noun analysis as
|
||||
* part of the analysis workflow and saving off those tokens for use in another field.
|
||||
*
|
||||
|
|
|
@ -36,7 +36,6 @@ import org.tartarus.snowball.SnowballProgram;
|
|||
* <li>For the Turkish language, see {@link TurkishLowerCaseFilter}.
|
||||
* <li>For other languages, see {@link LowerCaseFilter}.
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Note: This filter is aware of the {@link KeywordAttribute}. To prevent
|
||||
|
|
|
@ -31,7 +31,6 @@ import org.apache.lucene.util.AttributeFactory;
|
|||
* This class implements the Word Break rules from the
|
||||
* Unicode Text Segmentation algorithm, as specified in
|
||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
|
||||
* <p/>
|
||||
* <p>Many applications have specific tokenizer needs. If this tokenizer does
|
||||
* not suit your application, please consider copying this source code
|
||||
* directory to your project and maintaining your own grammar-based tokenizer.
|
||||
|
|
|
@ -25,7 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
* This class implements Word Break rules from the Unicode Text Segmentation
|
||||
* algorithm, as specified in
|
||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Tokens produced are of the following types:
|
||||
* <ul>
|
||||
* <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
|
||||
|
|
|
@ -23,7 +23,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
* This class implements Word Break rules from the Unicode Text Segmentation
|
||||
* algorithm, as specified in
|
||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Tokens produced are of the following types:
|
||||
* <ul>
|
||||
* <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
|
||||
|
|
|
@ -31,7 +31,7 @@ import org.apache.lucene.util.AttributeFactory;
|
|||
* algorithm, as specified in
|
||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
|
||||
* URLs and email addresses are also tokenized according to the relevant RFCs.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Tokens produced are of the following types:
|
||||
* <ul>
|
||||
* <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
|
||||
|
|
|
@ -26,7 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
* algorithm, as specified in
|
||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
|
||||
* URLs and email addresses are also tokenized according to the relevant RFCs.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Tokens produced are of the following types:
|
||||
* <ul>
|
||||
* <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
|
||||
|
|
|
@ -24,7 +24,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
* algorithm, as specified in
|
||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
|
||||
* URLs and email addresses are also tokenized according to the relevant RFCs.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Tokens produced are of the following types:
|
||||
* <ul>
|
||||
* <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
|
||||
|
|
|
@ -58,7 +58,7 @@ import org.apache.lucene.util.fst.FST;
|
|||
* Then input <code>a b c d e</code> parses to <code>y b c
|
||||
* d</code>, ie the 2nd rule "wins" because it started
|
||||
* earliest and matched the most input tokens of other rules
|
||||
* starting at that point.</p>
|
||||
* starting at that point.
|
||||
*
|
||||
* <p>A future improvement to this filter could allow
|
||||
* non-greedy parsing, such that the 3rd rule would win, and
|
||||
|
|
|
@ -73,7 +73,6 @@ import org.apache.lucene.util.Version;
|
|||
* <li><code>boolean expand</code> - true if conflation groups should be expanded, false if they are one-directional</li>
|
||||
* <li><code>{@link Analyzer} analyzer</code> - an analyzer used for each raw synonym</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
* @see SolrSynonymParser SolrSynonymParser: default format
|
||||
*/
|
||||
public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
||||
|
|
|
@ -130,7 +130,6 @@ public class WordlistLoader {
|
|||
* <li>The comment character is the vertical line (|).
|
||||
* <li>Lines may contain trailing comments.
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* @param reader Reader containing a Snowball stopword list
|
||||
* @param result the {@link CharArraySet} to fill with the readers words
|
||||
|
@ -164,7 +163,6 @@ public class WordlistLoader {
|
|||
* <li>The comment character is the vertical line (|).
|
||||
* <li>Lines may contain trailing comments.
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* @param reader Reader containing a Snowball stopword list
|
||||
* @return A {@link CharArraySet} with the reader's words
|
||||
|
|
|
@ -34,8 +34,6 @@ import java.util.*;
|
|||
/**
|
||||
* Extension of StandardTokenizer that is aware of Wikipedia syntax. It is based off of the
|
||||
* Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
|
||||
* <p/>
|
||||
* <p/>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class WikipediaTokenizer extends Tokenizer {
|
||||
|
|
|
@ -1,5 +1,22 @@
|
|||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -20,23 +37,7 @@ import java.io.IOException;
|
|||
import java.util.Arrays;
|
||||
import java.util.Random;
|
||||
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
* <p/>
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* <p/>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p/>
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** tests for classicanalyzer */
|
||||
public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
private Analyzer a = new ClassicAnalyzer();
|
||||
|
|
|
@ -28,14 +28,14 @@ import org.apache.lucene.util.Version;
|
|||
* Base class for testing tokenstream factories.
|
||||
* <p>
|
||||
* Example usage:
|
||||
* <code><pre>
|
||||
* <pre class="prettyprint">
|
||||
* Reader reader = new StringReader("Some Text to Analyze");
|
||||
* reader = charFilterFactory("htmlstrip").create(reader);
|
||||
* TokenStream stream = tokenizerFactory("standard").create(reader);
|
||||
* stream = tokenFilterFactory("lowercase").create(stream);
|
||||
* stream = tokenFilterFactory("asciifolding").create(stream);
|
||||
* assertTokenStreamContents(stream, new String[] { "some", "text", "to", "analyze" });
|
||||
* </pre></code>
|
||||
* </pre>
|
||||
*/
|
||||
// TODO: this has to be here, since the abstract factories are not in lucene-core,
|
||||
// so test-framework doesnt know about them...
|
||||
|
|
|
@ -39,7 +39,7 @@ import java.util.regex.Pattern;
|
|||
* Generates a file containing JFlex macros to accept valid ASCII TLDs
|
||||
* (top level domains), for inclusion in JFlex grammars that can accept
|
||||
* domain names.
|
||||
* <p/>
|
||||
* <p>
|
||||
* The IANA Root Zone Database is queried via HTTP from URL cmdline arg #0, the
|
||||
* response is parsed, and the results are written out to a file containing
|
||||
* a JFlex macro that will accept all valid ASCII-only TLDs, including punycode
|
||||
|
|
|
@ -46,11 +46,10 @@ import com.ibm.icu.text.UnicodeSet;
|
|||
* <li>Conversion from Fullwidth to Halfwidth forms.
|
||||
* <li>Script conversions, for example Serbian Cyrillic to Latin
|
||||
* </ul>
|
||||
* </p>
|
||||
* <p>
|
||||
* Example usage: <blockquote>stream = new ICUTransformFilter(stream,
|
||||
* Transliterator.getInstance("Traditional-Simplified"));</blockquote>
|
||||
* </p>
|
||||
* <br>
|
||||
* For more details, see the <a
|
||||
* href="http://userguide.icu-project.org/transforms/general">ICU User
|
||||
* Guide</a>.
|
||||
|
|
|
@ -42,8 +42,7 @@ import com.ibm.icu.text.RuleBasedBreakIterator;
|
|||
* Words are broken across script boundaries, then segmented according to
|
||||
* the BreakIterator and typing provided by the {@link DefaultICUTokenizerConfig}.
|
||||
*
|
||||
* <p/>
|
||||
*
|
||||
* <p>
|
||||
* To use the default set of per-script rules:
|
||||
*
|
||||
* <pre class="prettyprint" >
|
||||
|
@ -53,13 +52,13 @@ import com.ibm.icu.text.RuleBasedBreakIterator;
|
|||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* <p/>
|
||||
*
|
||||
* <p>
|
||||
* You can customize this tokenizer's behavior by specifying per-script rule files,
|
||||
* which are compiled by the ICU RuleBasedBreakIterator. See the
|
||||
* <a href="http://userguide.icu-project.org/boundaryanalysis#TOC-RBBI-Rules"
|
||||
* >ICU RuleBasedBreakIterator syntax reference</a>.
|
||||
*
|
||||
* <p>
|
||||
* To add per-script rules, add a "rulefiles" argument, which should contain a
|
||||
* comma-separated list of <tt>code:rulefile</tt> pairs in the following format:
|
||||
* <a href="http://unicode.org/iso15924/iso15924-codes.html"
|
||||
|
|
|
@ -28,11 +28,11 @@ import java.io.Reader;
|
|||
* <p>
|
||||
* Sequences of iteration marks are supported. In case an illegal sequence of iteration
|
||||
* marks is encountered, the implementation emits the illegal source character as-is
|
||||
* without considering its script. For example, with input "?ゝ", we get
|
||||
* "??" even though "?" isn't hiragana.
|
||||
* without considering its script. For example, with input "?ゝ", we get
|
||||
* "??" even though the question mark isn't hiragana.
|
||||
* </p>
|
||||
* <p>
|
||||
* Note that a full stop punctuation character "。" (U+3002) can not be iterated
|
||||
* Note that a full stop punctuation character "。" (U+3002) can not be iterated
|
||||
* (see below). Iteration marks themselves can be emitted in case they are illegal,
|
||||
* i.e. if they go back past the beginning of the character stream.
|
||||
* </p>
|
||||
|
|
|
@ -24,7 +24,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
|
||||
/**
|
||||
* Factory for {@link JapaneseNumberFilter}.
|
||||
* <p>
|
||||
* <br>
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_ja" class="solr.TextField">
|
||||
* <analyzer>
|
||||
|
|
|
@ -54,7 +54,6 @@ public class TestJapaneseKatakanaStemFilter extends BaseTokenStreamTestCase {
|
|||
* <li>center</li>
|
||||
* </ul>
|
||||
* Note that we remove a long sound in the case of "coffee" that is required.
|
||||
* </p>
|
||||
*/
|
||||
public void testStemVariants() throws IOException {
|
||||
assertAnalyzesTo(analyzer, "コピー コーヒー タクシー パーティー パーティ センター",
|
||||
|
|
|
@ -40,9 +40,8 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
/**
|
||||
* Factory for {@link PhoneticFilter}.
|
||||
*
|
||||
* Create tokens based on phonetic encoders from <a href="
|
||||
* http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html
|
||||
* ">Apache Commons Codec</a>.
|
||||
* Create tokens based on phonetic encoders from
|
||||
* <a href="http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html">Apache Commons Codec</a>.
|
||||
* <p>
|
||||
* This takes one required argument, "encoder", and the rest are optional:
|
||||
* <dl>
|
||||
|
|
|
@ -54,7 +54,7 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
|
|||
|
||||
/**
|
||||
* analyzes the tokenizer input using the given analysis engine
|
||||
* <p/>
|
||||
* <p>
|
||||
* {@link #cas} will be filled with extracted metadata (UIMA annotations, feature structures)
|
||||
*
|
||||
* @throws IOException If there is a low-level I/O error.
|
||||
|
|
|
@ -38,7 +38,7 @@ import java.util.List;
|
|||
* File can be specified as a absolute, relative or resource.
|
||||
* Two properties can be set:
|
||||
* file.query.maker.file=<Full path to file containing queries>
|
||||
* <br/>
|
||||
* <br>
|
||||
* file.query.maker.default.field=<Name of default field - Default value is "body">
|
||||
*
|
||||
* Example:
|
||||
|
|
|
@ -63,7 +63,7 @@ import java.util.regex.Pattern;
|
|||
* Each component analysis factory may specify <tt>luceneMatchVersion</tt> (defaults to
|
||||
* {@link Version#LATEST}) and any of the args understood by the specified
|
||||
* *Factory class, in the above-describe param format.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Example:
|
||||
* <pre>
|
||||
* -AnalyzerFactory(name:'strip html, fold to ascii, whitespace tokenize, max 10k tokens',
|
||||
|
@ -75,7 +75,7 @@ import java.util.regex.Pattern;
|
|||
* [...]
|
||||
* -NewAnalyzer('strip html, fold to ascii, whitespace tokenize, max 10k tokens')
|
||||
* </pre>
|
||||
* <p/>
|
||||
* <p>
|
||||
* AnalyzerFactory will direct analysis component factories to look for resources
|
||||
* under the directory specified in the "work.dir" property.
|
||||
*/
|
||||
|
|
|
@ -96,11 +96,11 @@ public class NewAnalyzerTask extends PerfTask {
|
|||
/**
|
||||
* Set the params (analyzerName only), Comma-separate list of Analyzer class names. If the Analyzer lives in
|
||||
* org.apache.lucene.analysis, the name can be shortened by dropping the o.a.l.a part of the Fully Qualified Class Name.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Analyzer names may also refer to previously defined AnalyzerFactory's.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Example Declaration: {"NewAnalyzer" NewAnalyzer(WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, standard.StandardAnalyzer) >
|
||||
* <p/>
|
||||
* <p>
|
||||
* Example AnalyzerFactory usage:
|
||||
* <pre>
|
||||
* -AnalyzerFactory(name:'whitespace tokenized',WhitespaceTokenizer)
|
||||
|
|
|
@ -27,12 +27,11 @@ import org.apache.lucene.benchmark.byTask.PerfRunData;
|
|||
|
||||
/**
|
||||
* Task to support benchmarking collation.
|
||||
* <p>
|
||||
* <br>
|
||||
* <ul>
|
||||
* <li> <code>NewCollationAnalyzer</code> with the default jdk impl
|
||||
* <li> <code>NewCollationAnalyzer(impl:icu)</code> specify an impl (jdk,icu)
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
public class NewCollationAnalyzerTask extends PerfTask {
|
||||
/**
|
||||
|
|
|
@ -33,7 +33,6 @@ import org.apache.lucene.benchmark.byTask.PerfRunData;
|
|||
* <li><code>ROOT</code>: The root (language-agnostic) Locale
|
||||
* <li><empty string>: Erase the Locale (null)
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
public class NewLocaleTask extends PerfTask {
|
||||
private String language;
|
||||
|
|
|
@ -51,7 +51,6 @@ import org.apache.lucene.util.Bits;
|
|||
* Read index (abstract) task.
|
||||
* Sub classes implement withSearch(), withWarm(), withTraverse() and withRetrieve()
|
||||
* methods to configure the actual action.
|
||||
* <p/>
|
||||
* <p>Note: All ReadTasks reuse the reader if it is already open.
|
||||
* Otherwise a reader is opened at start and closed at the end.
|
||||
* <p>
|
||||
|
@ -238,7 +237,7 @@ public abstract class ReadTask extends PerfTask {
|
|||
/**
|
||||
* Specify the number of hits to traverse. Tasks should override this if they want to restrict the number
|
||||
* of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Read task calculates the traversal as: Math.min(hits.length(), traversalSize())
|
||||
*
|
||||
* @return Integer.MAX_VALUE
|
||||
|
|
|
@ -25,7 +25,6 @@ import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
|
|||
*
|
||||
* <p>Note: This task reuses the reader if it is already open.
|
||||
* Otherwise a reader is opened at start and closed at the end.
|
||||
* <p/>
|
||||
*
|
||||
* <p>Takes optional param: traversal size (otherwise all results are traversed).</p>
|
||||
*
|
||||
|
|
|
@ -30,11 +30,11 @@ import java.util.StringTokenizer;
|
|||
|
||||
/**
|
||||
* Perf run configuration properties.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Numeric property containing ":", e.g. "10:100:5" is interpreted
|
||||
* as array of numeric values. It is extracted once, on first use, and
|
||||
* maintain a round number to return the appropriate value.
|
||||
* <p/>
|
||||
* <p>
|
||||
* The config property "work.dir" tells where is the root of
|
||||
* docs data dirs and indexes dirs. It is set to either of: <ul>
|
||||
* <li>value supplied for it in the alg file;</li>
|
||||
|
|
|
@ -42,7 +42,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
/**
|
||||
* A simplistic Lucene based NaiveBayes classifier, with caching feature, see
|
||||
* <code>http://en.wikipedia.org/wiki/Naive_Bayes_classifier</code>
|
||||
* <p/>
|
||||
* <p>
|
||||
* This is NOT an online classifier.
|
||||
*
|
||||
* @lucene.experimental
|
||||
|
|
|
@ -38,7 +38,6 @@ import org.apache.lucene.util.RamUsageEstimator;
|
|||
* </p>
|
||||
* Another application of the set is that it can be used to perform fuzzy counting because
|
||||
* it can estimate reasonably accurately how many unique values are contained in the set.
|
||||
* </p>
|
||||
* <p>This class is NOT threadsafe.</p>
|
||||
* <p>
|
||||
* Internally a Bitset is used to record values and once a client has finished recording
|
||||
|
|
|
@ -63,9 +63,8 @@ import org.apache.lucene.util.fst.Util;
|
|||
* <li><tt>.tix</tt>: <a href="#Termindex">Term Index</a></li>
|
||||
* <li><tt>.tbk</tt>: <a href="#Termblock">Term Block</a></li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* <a name="Termindex" id="Termindex"></a>
|
||||
* <a name="Termindex"></a>
|
||||
* <h3>Term Index</h3>
|
||||
* <p>
|
||||
* The .tix contains a list of FSTs, one for each field.
|
||||
|
@ -87,7 +86,7 @@ import org.apache.lucene.util.fst.Util;
|
|||
* </li>
|
||||
* </ul>
|
||||
*
|
||||
* <a name="Termblock" id="Termblock"></a>
|
||||
* <a name="Termblock"></a>
|
||||
* <h3>Term Block</h3>
|
||||
* <p>
|
||||
* The .tbk contains all the statistics and metadata for terms, along with field summary (e.g.
|
||||
|
@ -98,7 +97,6 @@ import org.apache.lucene.util.fst.Util;
|
|||
* <li>metadata bytes block: encodes other parts of metadata; </li>
|
||||
* <li>skip block: contains skip data, to speed up metadata seeking and decoding</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* <p>File Format:</p>
|
||||
* <ul>
|
||||
|
|
|
@ -61,7 +61,7 @@ import org.apache.lucene.util.fst.Util;
|
|||
* </ul>
|
||||
* <p>
|
||||
*
|
||||
* <a name="Termdictionary" id="Termdictionary"></a>
|
||||
* <a name="Termdictionary"></a>
|
||||
* <h3>Term Dictionary</h3>
|
||||
* <p>
|
||||
* The .tst contains a list of FSTs, one for each field.
|
||||
|
@ -80,7 +80,6 @@ import org.apache.lucene.util.fst.Util;
|
|||
* Generic byte array: Used to store non-monotonic metadata.
|
||||
* </li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* File format:
|
||||
* <ul>
|
||||
|
|
|
@ -164,7 +164,7 @@
|
|||
<property name="javac.debug" value="on"/>
|
||||
<property name="javac.source" value="1.8"/>
|
||||
<property name="javac.target" value="1.8"/>
|
||||
<property name="javac.args" value="-Xlint -Xlint:-deprecation -Xlint:-serial -Xlint:-options -Xdoclint:all/protected -Xdoclint:-html -Xdoclint:-missing"/>
|
||||
<property name="javac.args" value="-Xlint -Xlint:-deprecation -Xlint:-serial -Xlint:-options -Xdoclint:all/protected -Xdoclint:-missing"/>
|
||||
<property name="javadoc.link" value="http://download.oracle.com/javase/8/docs/api/"/>
|
||||
<property name="javadoc.link.junit" value="http://junit.sourceforge.net/javadoc/"/>
|
||||
<property name="javadoc.packagelist.dir" location="${common.dir}/tools/javadoc"/>
|
||||
|
@ -373,7 +373,7 @@
|
|||
</target>
|
||||
|
||||
<!-- for now enable only some doclint: -->
|
||||
<property name="javadoc.args" value="-Xdoclint:all -Xdoclint:-html -Xdoclint:-missing"/>
|
||||
<property name="javadoc.args" value="-Xdoclint:all -Xdoclint:-missing"/>
|
||||
|
||||
<!-- Import custom ANT tasks. -->
|
||||
<import file="${common.dir}/tools/custom-tasks.xml" />
|
||||
|
|
|
@ -29,7 +29,7 @@ import org.apache.lucene.util.AttributeSource;
|
|||
* are intended to be consumed more than once. It caches
|
||||
* all token attribute states locally in a List when the first call to
|
||||
* {@link #incrementToken()} is called. Subsequent calls will used the cache.
|
||||
* <p/>
|
||||
* <p>
|
||||
* <em>Important:</em> Like any proper TokenFilter, {@link #reset()} propagates
|
||||
* to the input, although only before {@link #incrementToken()} is called the
|
||||
* first time. Prior to Lucene 5, it was never propagated.
|
||||
|
|
|
@ -34,7 +34,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
<p>
|
||||
The start and end offsets permit applications to re-associate a token with
|
||||
its source text, e.g., to display highlighted query terms in a document
|
||||
browser, or to show matching text fragments in a <abbr title="KeyWord In Context">KWIC</abbr>
|
||||
browser, or to show matching text fragments in a <a href="http://en.wikipedia.org/wiki/Key_Word_in_Context">KWIC</a>
|
||||
display, etc.
|
||||
<p>
|
||||
The type is a string, assigned by a lexical analyzer
|
||||
|
@ -61,12 +61,10 @@ import org.apache.lucene.util.BytesRef;
|
|||
<li>The startOffset and endOffset represent the start and offset in the source text, so be careful in adjusting them.</li>
|
||||
<li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
|
||||
</ul>
|
||||
</p>
|
||||
<p>
|
||||
<b>Please note:</b> With Lucene 3.1, the <code>{@linkplain #toString toString()}</code> method had to be changed to match the
|
||||
{@link CharSequence} interface introduced by the interface {@link org.apache.lucene.analysis.tokenattributes.CharTermAttribute}.
|
||||
This method now only prints the term text, no additional information anymore.
|
||||
</p>
|
||||
@deprecated This class is outdated and no longer used since Lucene 2.9. Nuke it finally!
|
||||
*/
|
||||
@Deprecated
|
||||
|
|
|
@ -161,7 +161,7 @@ public abstract class TokenStream extends AttributeSource implements Closeable {
|
|||
* consumed, after {@link #incrementToken()} returned <code>false</code>
|
||||
* (using the new <code>TokenStream</code> API). Streams implementing the old API
|
||||
* should upgrade to use this feature.
|
||||
* <p/>
|
||||
* <p>
|
||||
* This method can be used to perform any end-of-stream operations, such as
|
||||
* setting the final offset of a stream. The final offset of a stream might
|
||||
* differ from the offset of the last token eg in case one or more whitespaces
|
||||
|
|
|
@ -21,13 +21,11 @@
|
|||
* <h2>Parsing? Tokenization? Analysis!</h2>
|
||||
* <p>
|
||||
* Lucene, an indexing and search library, accepts only plain text input.
|
||||
* <p>
|
||||
* <h2>Parsing</h2>
|
||||
* <p>
|
||||
* Applications that build their search capabilities upon Lucene may support documents in various formats – HTML, XML, PDF, Word – just to name a few.
|
||||
* Lucene does not care about the <i>Parsing</i> of these and other document formats, and it is the responsibility of the
|
||||
* application using Lucene to use an appropriate <i>Parser</i> to convert the original format into plain text before passing that plain text to Lucene.
|
||||
* <p>
|
||||
* <h2>Tokenization</h2>
|
||||
* <p>
|
||||
* Plain text passed to Lucene for indexing goes through a process generally called tokenization. Tokenization is the process
|
||||
|
@ -67,8 +65,7 @@
|
|||
* Adding in synonyms at the same token position as the current word can mean better
|
||||
* matching when users search with words in the synonym set.
|
||||
* </li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* </ul>
|
||||
* <h2>Core Analysis</h2>
|
||||
* <p>
|
||||
* The analysis package provides the mechanism to convert Strings and Readers
|
||||
|
@ -249,7 +246,6 @@
|
|||
* This might sometimes require a modified analyzer – see the next section on how to do that.
|
||||
* </li>
|
||||
* </ol>
|
||||
* </p>
|
||||
* <h2>Implementing your own Analyzer and Analysis Components</h2>
|
||||
* <p>
|
||||
* Creating your own Analyzer is straightforward. Your Analyzer should subclass {@link org.apache.lucene.analysis.Analyzer}. It can use
|
||||
|
@ -416,7 +412,7 @@
|
|||
* This new attribute makes clear that "IBM" and "International Business Machines" start and end
|
||||
* at the same positions.
|
||||
* </p>
|
||||
* <a name="corrupt" />
|
||||
* <a name="corrupt"></a>
|
||||
* <h3>How to not write corrupt token streams</h3>
|
||||
* <p>
|
||||
* There are a few rules to observe when writing custom Tokenizers and TokenFilters:
|
||||
|
@ -586,7 +582,6 @@
|
|||
* a chain of a TokenStream and multiple TokenFilters is used, then all TokenFilters in that chain share the Attributes
|
||||
* with the TokenStream.
|
||||
* </li>
|
||||
* <br>
|
||||
* <li>
|
||||
* Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter needs to update
|
||||
* the appropriate Attribute(s) in incrementToken(). The consumer, commonly the Lucene indexer, consumes the data in the
|
||||
|
@ -594,13 +589,11 @@
|
|||
* was reached. This means that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data in
|
||||
* the Attribute instances.
|
||||
* </li>
|
||||
* <br>
|
||||
* <li>
|
||||
* For performance reasons a TokenStream/-Filter should add/get Attributes during instantiation; i.e., create an attribute in the
|
||||
* constructor and store references to it in an instance variable. Using an instance variable instead of calling addAttribute()/getAttribute()
|
||||
* in incrementToken() will avoid attribute lookups for every token in the document.
|
||||
* </li>
|
||||
* <br>
|
||||
* <li>
|
||||
* All methods in AttributeSource are idempotent, which means calling them multiple times always yields the same
|
||||
* result. This is especially important to know for addAttribute(). The method takes the <b>type</b> (<code>Class</code>)
|
||||
|
|
|
@ -26,8 +26,6 @@ import org.apache.lucene.store.IOContext;
|
|||
/**
|
||||
* Expert: Controls the format of the
|
||||
* {@link SegmentInfo} (segment metadata file).
|
||||
* <p>
|
||||
*
|
||||
* @see SegmentInfo
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
|
|
@ -32,7 +32,6 @@ import org.apache.lucene.util.BytesRef;
|
|||
|
||||
/**
|
||||
* Codec API for writing stored fields:
|
||||
* <p>
|
||||
* <ol>
|
||||
* <li>For every document, {@link #startDocument()} is called,
|
||||
* informing the Codec that a new document has started.
|
||||
|
|
|
@ -36,7 +36,6 @@ import org.apache.lucene.util.BytesRefBuilder;
|
|||
|
||||
/**
|
||||
* Codec API for writing term vectors:
|
||||
* <p>
|
||||
* <ol>
|
||||
* <li>For every document, {@link #startDocument(int)} is called,
|
||||
* informing the Codec how many fields will be written.
|
||||
|
|
|
@ -93,7 +93,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* <a name="Termdictionary" id="Termdictionary"></a>
|
||||
* <a name="Termdictionary"></a>
|
||||
* <h3>Term Dictionary</h3>
|
||||
*
|
||||
* <p>The .tim file contains the list of terms in each
|
||||
|
@ -152,7 +152,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>For inner nodes of the tree, every entry will steal one bit to mark whether it points
|
||||
* to child nodes(sub-block). If so, the corresponding TermStats and TermMetaData are omitted </li>
|
||||
* </ul>
|
||||
* <a name="Termindex" id="Termindex"></a>
|
||||
* <a name="Termindex"></a>
|
||||
* <h3>Term Index</h3>
|
||||
* <p>The .tip file contains an index into the term dictionary, so that it can be
|
||||
* accessed randomly. The index is also used to determine
|
||||
|
|
|
@ -37,8 +37,8 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* 1024 chunks, this index computes the average number of bytes per
|
||||
* chunk and for every chunk, only stores the difference between<ul>
|
||||
* <li>${chunk number} * ${average length of a chunk}</li>
|
||||
* <li>and the actual start offset of the chunk</li></ul></p>
|
||||
* <p>Data is written as follows:</p>
|
||||
* <li>and the actual start offset of the chunk</li></ul>
|
||||
* <p>Data is written as follows:
|
||||
* <ul>
|
||||
* <li>PackedIntsVersion, <Block><sup>BlockCount</sup>, BlocksEndMarker</li>
|
||||
* <li>PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li>
|
||||
|
@ -57,7 +57,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>StartPointerDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerStartPointerDelta bits each, representing the deltas from the average start pointer using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>Notes</p>
|
||||
* <p>Notes
|
||||
* <ul>
|
||||
* <li>For any block, the doc base of the n-th chunk can be restored with
|
||||
* <code>DocBase + AvgChunkDocs * n + DocBaseDeltas[n]</code>.</li>
|
||||
|
|
|
@ -89,7 +89,7 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
|||
* <li><tt>.dvm</tt>: DocValues metadata</li>
|
||||
* </ol>
|
||||
* <ol>
|
||||
* <li><a name="dvm" id="dvm"></a>
|
||||
* <li><a name="dvm"></a>
|
||||
* <p>The DocValues metadata or .dvm file.</p>
|
||||
* <p>For DocValues field, this stores metadata, such as the offset into the
|
||||
* DocValues data (.dvd)</p>
|
||||
|
@ -150,7 +150,7 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
|||
* is written for the addresses.
|
||||
* <p>MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field.
|
||||
* If it's -1, then there are no missing values. If it's -2, all values are missing.
|
||||
* <li><a name="dvd" id="dvd"></a>
|
||||
* <li><a name="dvd"></a>
|
||||
* <p>The DocValues data or .dvd file.</p>
|
||||
* <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
|
||||
* <p>DocValues data (.dvd) --> Header,<NumericData | BinaryData | SortedData><sup>NumFields</sup>,Footer</p>
|
||||
|
|
|
@ -40,10 +40,9 @@ import org.apache.lucene.store.IndexOutput;
|
|||
|
||||
/**
|
||||
* Lucene 5.0 Field Infos format.
|
||||
* <p>
|
||||
* <p>Field names are stored in the field info file, with suffix <tt>.fnm</tt>.</p>
|
||||
* <p>Field names are stored in the field info file, with suffix <tt>.fnm</tt>.
|
||||
* <p>FieldInfos (.fnm) --> Header,FieldsCount, <FieldName,FieldNumber,
|
||||
* FieldBits,DocValuesBits,DocValuesGen,Attributes> <sup>FieldsCount</sup>,Footer</p>
|
||||
* FieldBits,DocValuesBits,DocValuesGen,Attributes> <sup>FieldsCount</sup>,Footer
|
||||
* <p>Data types:
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#checkIndexHeader IndexHeader}</li>
|
||||
|
@ -55,7 +54,6 @@ import org.apache.lucene.store.IndexOutput;
|
|||
* <li>DocValuesGen --> {@link DataOutput#writeLong(long) Int64}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
* Field Descriptions:
|
||||
* <ul>
|
||||
* <li>FieldsCount: the number of fields in this file.</li>
|
||||
|
|
|
@ -36,12 +36,11 @@ import org.apache.lucene.util.MutableBits;
|
|||
|
||||
/**
|
||||
* Lucene 5.0 live docs format
|
||||
* <p>
|
||||
* <p>The .liv file is optional, and only exists when a segment contains
|
||||
* deletions.</p>
|
||||
* deletions.
|
||||
* <p>Although per-segment, this file is maintained exterior to compound segment
|
||||
* files.</p>
|
||||
* <p>Deletions (.liv) --> IndexHeader,Generation,Bits</p>
|
||||
* files.
|
||||
* <p>Deletions (.liv) --> IndexHeader,Generation,Bits
|
||||
* <ul>
|
||||
* <li>SegmentHeader --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
|
||||
* <li>Bits --> <{@link DataOutput#writeLong Int64}> <sup>LongCount</sup></li>
|
||||
|
|
|
@ -35,7 +35,6 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* Lucene 5.0 Score normalization format.
|
||||
* <p>
|
||||
* Encodes normalization values with these strategies:
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>Uncompressed: when values fit into a single byte and would require more than 4 bits
|
||||
* per value, they are just encoded as an uncompressed byte array.
|
||||
|
@ -65,7 +64,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li><tt>.nvm</tt>: Norms metadata</li>
|
||||
* </ol>
|
||||
* <ol>
|
||||
* <li><a name="nvm" id="nvm"></a>
|
||||
* <li><a name="nvm"></a>
|
||||
* <p>The Norms metadata or .nvm file.</p>
|
||||
* <p>For each norms field, this stores metadata, such as the offset into the
|
||||
* Norms data (.nvd)</p>
|
||||
|
@ -94,7 +93,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>6 --> patched table. Documents with very common values are written with a lookup table.
|
||||
* Other values are written using a nested indirect.
|
||||
* </ul>
|
||||
* <li><a name="nvd" id="nvd"></a>
|
||||
* <li><a name="nvd"></a>
|
||||
* <p>The Norms data or .nvd file.</p>
|
||||
* <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
|
||||
* <p>Norms data (.nvd) --> Header,<Uncompressed | TableCompressed | DeltaCompressed | MonotonicCompressed ><sup>NumFields</sup>,Footer</p>
|
||||
|
|
|
@ -95,7 +95,6 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* this reduces disk pre-fetches.</p>
|
||||
* </li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Files and detailed format:
|
||||
|
@ -106,9 +105,8 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li><tt>.pos</tt>: <a href="#Positions">Positions</a></li>
|
||||
* <li><tt>.pay</tt>: <a href="#Payloads">Payloads and Offsets</a></li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* <a name="Termdictionary" id="Termdictionary"></a>
|
||||
* <a name="Termdictionary"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Term Dictionary</b>
|
||||
|
@ -118,11 +116,10 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* and pointers to the frequencies, positions, payload and
|
||||
* skip data in the .doc, .pos, and .pay files.
|
||||
* See {@link BlockTreeTermsWriter} for more details on the format.
|
||||
* </p>
|
||||
*
|
||||
* <p>NOTE: The term dictionary can plug into different postings implementations:
|
||||
* the postings writer/reader are actually responsible for encoding
|
||||
* and decoding the PostingsHeader and TermMetadata sections described here:</p>
|
||||
* and decoding the PostingsHeader and TermMetadata sections described here:
|
||||
*
|
||||
* <ul>
|
||||
* <li>PostingsHeader --> Header, PackedBlockSize</li>
|
||||
|
@ -133,7 +130,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --> {@link DataOutput#writeVLong VLong}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>Notes:</p>
|
||||
* <p>Notes:
|
||||
* <ul>
|
||||
* <li>Header is a {@link CodecUtil#writeIndexHeader IndexHeader} storing the version information
|
||||
* for the postings.</li>
|
||||
|
@ -169,17 +166,17 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* </dd>
|
||||
* </dl>
|
||||
*
|
||||
* <a name="Termindex" id="Termindex"></a>
|
||||
* <a name="Termindex"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Term Index</b>
|
||||
* <p>The .tip file contains an index into the term dictionary, so that it can be
|
||||
* accessed randomly. See {@link BlockTreeTermsWriter} for more details on the format.</p>
|
||||
* accessed randomly. See {@link BlockTreeTermsWriter} for more details on the format.
|
||||
* </dd>
|
||||
* </dl>
|
||||
*
|
||||
*
|
||||
* <a name="Frequencies" id="Frequencies"></a>
|
||||
* <a name="Frequencies"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Frequencies and Skip Data</b>
|
||||
|
@ -208,7 +205,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>Notes:</p>
|
||||
* <p>Notes:
|
||||
* <ul>
|
||||
* <li>PackedDocDeltaBlock is theoretically generated from two steps:
|
||||
* <ol>
|
||||
|
@ -267,7 +264,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* </dd>
|
||||
* </dl>
|
||||
*
|
||||
* <a name="Positions" id="Positions"></a>
|
||||
* <a name="Positions"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Positions</b>
|
||||
|
@ -286,7 +283,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>PayloadData --> {@link DataOutput#writeByte byte}<sup>PayLength</sup></li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>Notes:</p>
|
||||
* <p>Notes:
|
||||
* <ul>
|
||||
* <li>TermPositions are ordered by term (terms are implicit, from the term dictionary), and position
|
||||
* values for each term document pair are incremental, and ordered by document number.</li>
|
||||
|
@ -320,12 +317,12 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* </dd>
|
||||
* </dl>
|
||||
*
|
||||
* <a name="Payloads" id="Payloads"></a>
|
||||
* <a name="Payloads"></a>
|
||||
* <dl>
|
||||
* <dd>
|
||||
* <b>Payloads and Offsets</b>
|
||||
* <p>The .pay file will store payloads and offsets associated with certain term-document positions.
|
||||
* Some payloads and offsets will be separated out into .pos file, for performance reasons.</p>
|
||||
* Some payloads and offsets will be separated out into .pos file, for performance reasons.
|
||||
* <ul>
|
||||
* <li>PayFile(.pay): --> Header, <TermPayloads, TermOffsets?> <sup>TermCount</sup>, Footer</li>
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
|
||||
|
@ -336,7 +333,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>PayData --> {@link DataOutput#writeByte byte}<sup>SumPayLength</sup></li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>Notes:</p>
|
||||
* <p>Notes:
|
||||
* <ul>
|
||||
* <li>The order of TermPayloads/TermOffsets will be the same as TermPositions, note that part of
|
||||
* payload/offsets are stored in .pos.</li>
|
||||
|
@ -352,7 +349,6 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* </ul>
|
||||
* </dd>
|
||||
* </dl>
|
||||
* </p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
|
|
@ -43,9 +43,7 @@ import org.apache.lucene.util.Version;
|
|||
* <ul>
|
||||
* <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Attributes, Footer
|
||||
* </ul>
|
||||
* </p>
|
||||
* Data types:
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
|
||||
* <li>SegSize --> {@link DataOutput#writeInt Int32}</li>
|
||||
|
@ -55,9 +53,7 @@ import org.apache.lucene.util.Version;
|
|||
* <li>IsCompoundFile --> {@link DataOutput#writeByte Int8}</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
* Field Descriptions:
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>SegVersion is the code version that created the segment.</li>
|
||||
* <li>SegSize is the number of documents contained in the segment index.</li>
|
||||
|
@ -70,7 +66,6 @@ import org.apache.lucene.util.Version;
|
|||
* addIndexes), etc.</li>
|
||||
* <li>Files is a list of files referred to by this segment.</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* @see SegmentInfos
|
||||
* @lucene.experimental
|
||||
|
|
|
@ -38,7 +38,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
/**
|
||||
* Lucene 5.0 stored fields format.
|
||||
*
|
||||
* <p><b>Principle</b></p>
|
||||
* <p><b>Principle</b>
|
||||
* <p>This {@link StoredFieldsFormat} compresses blocks of documents in
|
||||
* order to improve the compression ratio compared to document-level
|
||||
* compression. It uses the <a href="http://code.google.com/p/lz4/">LZ4</a>
|
||||
|
@ -50,17 +50,17 @@ import org.apache.lucene.util.packed.PackedInts;
* compression, you can choose ({@link Mode#BEST_COMPRESSION BEST_COMPRESSION}), which uses
* the <a href="http://en.wikipedia.org/wiki/DEFLATE">DEFLATE</a> algorithm with 60KB blocks
* for a better ratio at the expense of slower performance.
* These two options can be configured like this: </p>
* These two options can be configured like this:
* <pre class="prettyprint">
* // the default: for high performance
* indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_SPEED));
* // instead for better compression (but slower):
* // indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_COMPRESSION));
* </pre>
* <p><b>File formats</b></p>
* <p>Stored fields are represented by two files:</p>
* <p><b>File formats</b>
* <p>Stored fields are represented by two files:
* <ol>
* <li><a name="field_data" id="field_data"></a>
* <li><a name="field_data"></a>
* <p>A fields data file (extension <tt>.fdt</tt>). This file stores a compact
* representation of documents in compressed blocks of 16KB or more. When
* writing a segment, documents are appended to an in-memory <tt>byte[]</tt>
@ -106,7 +106,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>DirtyChunkCount --> the number of prematurely flushed chunks in this file</li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* <p>Notes</p>
|
||||
* <p>Notes
|
||||
* <ul>
|
||||
* <li>If documents are larger than 16KB then chunks will likely contain only
|
||||
* one document. However, documents can never spread across several chunks (all
|
||||
|
@ -123,7 +123,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* 0.5% larger than Docs.</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* <li><a name="field_index" id="field_index"></a>
|
||||
* <li><a name="field_index"></a>
|
||||
* <p>A fields index file (extension <tt>.fdx</tt>).</p>
|
||||
* <ul>
|
||||
* <li>FieldsIndex (.fdx) --> <Header>, <ChunkIndex>, Footer</li>
|
||||
|
@ -133,9 +133,9 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* </ul>
|
||||
* </li>
|
||||
* </ol>
|
||||
* <p><b>Known limitations</b></p>
|
||||
* <p><b>Known limitations</b>
|
||||
* <p>This {@link StoredFieldsFormat} does not support individual documents
|
||||
* larger than (<tt>2<sup>31</sup> - 2<sup>14</sup></tt>) bytes.</p>
|
||||
* larger than (<tt>2<sup>31</sup> - 2<sup>14</sup></tt>) bytes.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class Lucene50StoredFieldsFormat extends StoredFieldsFormat {
|
||||
|
|
|
@ -48,7 +48,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* Looking up term vectors for any document requires at most 1 disk seek.
|
||||
* <p><b>File formats</b>
|
||||
* <ol>
|
||||
* <li><a name="vector_data" id="vector_data"></a>
|
||||
* <li><a name="vector_data"></a>
|
||||
* <p>A vector data file (extension <tt>.tvd</tt>). This file stores terms,
|
||||
* frequencies, positions, offsets and payloads for every document. Upon writing
|
||||
* a new segment, it accumulates data into memory until the buffer used to store
|
||||
|
@ -111,8 +111,8 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* <li><a name="vector_index" id="vector_index"></a>
|
||||
* <p>An index file (extension <tt>.tvx</tt>).</p>
|
||||
* <li><a name="vector_index"></a>
|
||||
* <p>An index file (extension <tt>.tvx</tt>).
|
||||
* <ul>
|
||||
* <li>VectorIndex (.tvx) --> <Header>, <ChunkIndex>, Footer</li>
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
|
||||
|
|
|
@ -32,12 +32,13 @@
|
|||
* </li>
|
||||
* <li><a href="#Overview">Index Structure Overview</a></li>
|
||||
* <li><a href="#File_Naming">File Naming</a></li>
|
||||
* <li><a href="#file-names">Summary of File Extensions</a></li>
|
||||
* <li><a href="#file-names">Summary of File Extensions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Lock_File">Lock File</a></li>
|
||||
* <li><a href="#History">History</a></li>
|
||||
* <li><a href="#Limitations">Limitations</a></li>
|
||||
* </ul>
|
||||
* </li>
|
||||
* </ul>
|
||||
* </div>
|
||||
* <a name="Introduction"></a>
|
||||
|
@ -57,7 +58,7 @@
|
|||
* different programming languages should endeavor to agree on file formats, and
|
||||
* generate new versions of this document.</p>
|
||||
* </div>
|
||||
* <a name="Definitions" id="Definitions"></a>
|
||||
* <a name="Definitions"></a>
|
||||
* <h2>Definitions</h2>
|
||||
* <div>
|
||||
* <p>The fundamental concepts in Lucene are index, document, field and term.</p>
|
||||
|
@ -88,7 +89,7 @@
|
|||
* indexed literally.</p>
|
||||
* <p>See the {@link org.apache.lucene.document.Field Field}
|
||||
* java docs for more information on Fields.</p>
|
||||
* <a name="Segments" id="Segments"></a>
|
||||
* <a name="Segments"></a>
|
||||
* <h3>Segments</h3>
|
||||
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
|
||||
* Each segment is a fully independent index, which could be searched separately.
|
||||
|
@ -128,7 +129,7 @@
|
|||
* </li>
|
||||
* </ul>
|
||||
* </div>
|
||||
* <a name="Overview" id="Overview"></a>
|
||||
* <a name="Overview"></a>
|
||||
* <h2>Index Structure Overview</h2>
|
||||
* <div>
|
||||
* <p>Each segment index maintains the following:</p>
|
||||
|
@ -211,7 +212,7 @@
|
|||
* segments_1, then segments_2, etc. The generation is a sequential long integer
|
||||
* represented in alpha-numeric (base 36) form.</p>
|
||||
* </div>
|
||||
* <a name="file-names" id="file-names"></a>
|
||||
* <a name="file-names"></a>
|
||||
* <h2>Summary of File Extensions</h2>
|
||||
* <div>
|
||||
* <p>The following table summarizes the names and extensions of the files in
|
||||
|
@ -316,14 +317,14 @@
|
|||
* </tr>
|
||||
* </table>
|
||||
* </div>
|
||||
* <a name="Lock_File" id="Lock_File"></a>
|
||||
* <a name="Lock_File"></a>
|
||||
* <h2>Lock File</h2>
|
||||
* The write lock, which is stored in the index directory by default, is named
|
||||
* "write.lock". If the lock directory is different from the index directory then
|
||||
* the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
|
||||
* derived from the full path to the index directory. When this file is present, a
|
||||
* writer is currently modifying the index (adding or removing documents). This
|
||||
* lock file ensures that only one writer is modifying the index at a time.</p>
|
||||
* lock file ensures that only one writer is modifying the index at a time.
|
||||
* <a name="History"></a>
|
||||
* <h2>History</h2>
|
||||
* <p>Compatibility notes are provided in this document, describing how file
|
||||
|
@ -386,7 +387,7 @@
|
|||
* that is suitable for faceting/sorting/analytics.
|
||||
* </li>
|
||||
* </ul>
|
||||
* <a name="Limitations" id="Limitations"></a>
|
||||
* <a name="Limitations"></a>
|
||||
* <h2>Limitations</h2>
|
||||
* <div>
|
||||
* <p>Lucene uses a Java <code>int</code> to refer to
|
||||
|
|
|
@ -30,7 +30,6 @@
|
|||
* <li>Norms - see {@link org.apache.lucene.codecs.NormsFormat}</li>
|
||||
* <li>Live documents - see {@link org.apache.lucene.codecs.LiveDocsFormat}</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* For some concrete implementations beyond Lucene's official index format, see
|
||||
* the <a href="{@docRoot}/../codecs/overview-summary.html">Codecs module</a>.
|
||||
|
@ -51,7 +50,6 @@
* You will need to register the Codec class so that the {@link java.util.ServiceLoader ServiceLoader} can find it, by including a
* META-INF/services/org.apache.lucene.codecs.Codec file on your classpath that contains the package-qualified
* name of your codec.
* </p>
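The registration step described in this hunk amounts to an SPI entry plus a codec class with a public no-argument constructor. A minimal sketch, assuming a made-up com.example package and simple delegation to the stock Lucene50Codec (nothing here is prescribed by the commit itself):

// Hypothetical codec; java.util.ServiceLoader finds it once the classpath file
// META-INF/services/org.apache.lucene.codecs.Codec contains the single line:
//   com.example.MyCodec
package com.example;

import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;

public final class MyCodec extends FilterCodec {
  public MyCodec() {
    // SPI requires the public no-arg constructor; all formats delegate to the default codec.
    super("MyCodec", new Lucene50Codec());
  }
}

Selecting it at index time is then a one-liner along the lines of indexWriterConfig.setCodec(new MyCodec()).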
*
* <p>
* If you just want to customise the {@link org.apache.lucene.codecs.PostingsFormat}, or use different postings
@ -42,13 +42,13 @@ import org.apache.lucene.util.BytesRef;
|
|||
* NumericDocValuesField}, {@link SortedDocValuesField}, {@link
|
||||
* StringField}, {@link TextField}, {@link StoredField}.
|
||||
*
|
||||
* <p/> A field is a section of a Document. Each field has three
|
||||
* <p> A field is a section of a Document. Each field has three
|
||||
* parts: name, type and value. Values may be text
|
||||
* (String, Reader or pre-analyzed TokenStream), binary
|
||||
* (byte[]), or numeric (a Number). Fields are optionally stored in the
|
||||
* index, so that they may be returned with hits on the document.
|
||||
*
|
||||
* <p/>
|
||||
* <p>
|
||||
* NOTE: the field type is an {@link IndexableFieldType}. Making changes
|
||||
* to the state of the IndexableFieldType will impact any
|
||||
* Field it is used in. It is strongly recommended that no
|
||||
|
|
|
@ -59,21 +59,21 @@ import org.apache.lucene.util.NumericUtils;
* value, either by dividing the result of
* {@link java.util.Date#getTime} or using the separate getters
* (for year, month, etc.) to construct an <code>int</code> or
* <code>long</code> value.</p>
* <code>long</code> value.
*
* <p>To perform range querying or filtering against a
* <code>LongField</code>, use {@link NumericRangeQuery} or {@link
* NumericRangeFilter}. To sort according to a
* <code>LongField</code>, use the normal numeric sort types, eg
* {@link org.apache.lucene.search.SortField.Type#LONG}. <code>LongField</code>
* values can also be loaded directly from {@link org.apache.lucene.index.LeafReader#getNumericDocValues}.</p>
* values can also be loaded directly from {@link org.apache.lucene.index.LeafReader#getNumericDocValues}.
*
* <p>You may add the same field name as an <code>LongField</code> to
* the same document more than once. Range querying and
* filtering will be the logical OR of all values; so a range query
* will hit all documents that have at least one value in
* the range. However sort behavior is not defined. If you need to sort,
* you should separately index a single-valued <code>LongField</code>.</p>
* you should separately index a single-valued <code>LongField</code>.
*
* <p>A <code>LongField</code> will consume somewhat more disk space
* in the index than an ordinary single-valued field.
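As a concrete illustration of the LongField behaviour described in this hunk, here is a minimal, self-contained sketch that indexes a few long values and runs an inclusive NumericRangeQuery over them; the field name, values and in-memory directory are placeholders, not anything mandated by the class:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class LongFieldRangeDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      for (long ts = 1000L; ts <= 5000L; ts += 1000L) {
        Document doc = new Document();
        doc.add(new LongField("timestamp", ts, Field.Store.YES)); // default precisionStep
        writer.addDocument(doc);
      }
    }
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);
      // Inclusive range [2000, 4000]: matches three of the five documents indexed above.
      TopDocs hits = searcher.search(
          NumericRangeQuery.newLongRange("timestamp", 2000L, 4000L, true, true), 10);
      System.out.println("matches: " + hits.totalHits);
    }
  }
}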
@ -111,7 +111,7 @@ import org.apache.lucene.util.NumericUtils;
|
|||
* <p>If you only need to sort by numeric value, and never
|
||||
* run range querying/filtering, you can index using a
|
||||
* <code>precisionStep</code> of {@link Integer#MAX_VALUE}.
|
||||
* This will minimize disk space consumed. </p>
|
||||
* This will minimize disk space consumed.
|
||||
*
|
||||
* <p>More advanced users can instead use {@link
|
||||
* NumericTokenStream} directly, when indexing numbers. This
|
||||
|
|
|
@ -384,7 +384,6 @@ public abstract class DirectoryReader extends BaseCompositeReader<LeafReader> {
|
|||
|
||||
/**
|
||||
* Expert: return the IndexCommit that this reader has opened.
|
||||
* <p/>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract IndexCommit getIndexCommit() throws IOException;
|
||||
|
|
|
@ -63,9 +63,7 @@ import org.apache.lucene.util.StringHelper;
|
|||
* HasSegID, SegID, SegCodec, DelGen, DeletionCount, FieldInfosGen, DocValuesGen,
|
||||
* UpdatesFiles><sup>SegCount</sup>, CommitUserData, Footer
|
||||
* </ul>
|
||||
* </p>
|
||||
* Data types:
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
|
||||
* <li>NameCounter, SegCount, DeletionCount -->
|
||||
|
@ -81,9 +79,7 @@ import org.apache.lucene.util.StringHelper;
|
|||
* {@link DataOutput#writeStringSet(Set) Set<String>}></li>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
* Field Descriptions:
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>Version counts how often the index has been changed by adding or deleting
|
||||
* documents.</li>
|
||||
|
@ -113,7 +109,6 @@ import org.apache.lucene.util.StringHelper;
|
|||
* <li>UpdatesFiles stores the set of files that were updated in that segment
|
||||
* per field.</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
* Code to maintain and access indices.
|
||||
* <!-- TODO: add IndexWriter, IndexWriterConfig, DocValues, etc etc -->
|
||||
* <h2>Table Of Contents</h2>
|
||||
* <p>
|
||||
* <ol>
|
||||
* <li><a href="#postings">Postings APIs</a>
|
||||
* <ul>
|
||||
|
@ -38,7 +37,6 @@
|
|||
* </ul>
|
||||
* </li>
|
||||
* </ol>
|
||||
* </p>
|
||||
* <a name="postings"></a>
|
||||
* <h2>Postings APIs</h2>
|
||||
* <a name="fields"></a>
|
||||
|
@ -63,7 +61,6 @@
|
|||
* Terms terms = fields.terms(field);
|
||||
* }
|
||||
* </pre>
|
||||
* </p>
|
||||
* <a name="terms"></a>
|
||||
* <h3>
|
||||
* Terms
|
||||
|
@ -100,7 +97,6 @@
|
|||
* DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(null, null);
|
||||
* }
|
||||
* </pre>
|
||||
* </p>
|
||||
* <a name="documents"></a>
|
||||
* <h3>
|
||||
* Documents
|
||||
|
@ -116,7 +112,6 @@
|
|||
* System.out.println(docsEnum.freq());
|
||||
* }
|
||||
* </pre>
|
||||
* </p>
|
||||
* <a name="positions"></a>
|
||||
* <h3>
|
||||
* Positions
|
||||
|
@ -140,14 +135,12 @@
|
|||
* }
|
||||
* }
|
||||
* </pre>
|
||||
* </p>
|
||||
* <a name="stats"></a>
|
||||
* <h2>Index Statistics</h2>
|
||||
* <a name="termstats"></a>
|
||||
* <h3>
|
||||
* Term statistics
|
||||
* </h3>
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.index.TermsEnum#docFreq}: Returns the number of
|
||||
* documents that contain at least one occurrence of the term. This statistic
|
||||
|
@ -162,12 +155,10 @@
|
|||
* for the field. Like docFreq(), it will also count occurrences that appear in
|
||||
* deleted documents.
|
||||
* </ul>
|
||||
* </p>
|
||||
* <a name="fieldstats"></a>
|
||||
* <h3>
|
||||
* Field statistics
|
||||
* </h3>
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.index.Terms#size}: Returns the number of
|
||||
* unique terms in the field. This statistic may be unavailable
|
||||
|
@ -194,12 +185,10 @@
|
|||
* ({@link org.apache.lucene.index.IndexOptions#DOCS DOCS})
|
||||
* for the field.
|
||||
* </ul>
|
||||
* </p>
|
||||
* <a name="segmentstats"></a>
|
||||
* <h3>
|
||||
* Segment statistics
|
||||
* </h3>
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.index.IndexReader#maxDoc}: Returns the number of
|
||||
* documents (including deleted documents) in the index.
|
||||
|
@ -210,7 +199,6 @@
|
|||
* <li>{@link org.apache.lucene.index.Fields#size}: Returns the number of indexed
|
||||
* fields.
|
||||
* </ul>
|
||||
* </p>
|
||||
* <a name="documentstats"></a>
|
||||
* <h3>
|
||||
* Document statistics
|
||||
|
@ -220,8 +208,6 @@
|
|||
* a {@link org.apache.lucene.search.similarities.Similarity} implementation will store some
|
||||
* of these values (possibly in a lossy way), into the normalization value for the document in
|
||||
* its {@link org.apache.lucene.search.similarities.Similarity#computeNorm} method.
|
||||
* </p>
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.index.FieldInvertState#getLength}: Returns the number of
|
||||
* tokens for this field in the document. Note that this is just the number
|
||||
|
@ -248,11 +234,8 @@
|
|||
* <li>{@link org.apache.lucene.index.FieldInvertState#getMaxTermFrequency}: Returns the maximum
|
||||
* frequency across all unique terms encountered for this field in the document.
|
||||
* </ul>
|
||||
* </p>
|
||||
* <p>
|
||||
* Additional user-supplied statistics can be added to the document as DocValues fields and
|
||||
* accessed via {@link org.apache.lucene.index.LeafReader#getNumericDocValues}.
|
||||
* </p>
|
||||
* <p>
|
||||
*/
|
||||
package org.apache.lucene.index;
|
||||
|
|
|
@ -32,8 +32,7 @@ import org.apache.lucene.util.FixedBitSet;
|
|||
* term value in the specified field is contained in the
|
||||
* provided set of allowed terms.
|
||||
*
|
||||
* <p/>
|
||||
*
|
||||
* <p>
|
||||
* This is the same functionality as TermsFilter (from
|
||||
* queries/), except this filter requires that the
|
||||
* field contains only a single term for all documents.
|
||||
|
@ -41,9 +40,7 @@ import org.apache.lucene.util.FixedBitSet;
|
|||
* also have different performance characteristics, as
|
||||
* described below.
|
||||
*
|
||||
*
|
||||
* <p/>
|
||||
*
|
||||
* <p>
|
||||
* With each search, this filter translates the specified
|
||||
* set of Terms into a private {@link FixedBitSet} keyed by
|
||||
* term number per unique {@link IndexReader} (normally one
|
||||
|
@ -58,8 +55,7 @@ import org.apache.lucene.util.FixedBitSet;
|
|||
* index with a great many small documents may find this
|
||||
* linear scan too costly.
|
||||
*
|
||||
* <p/>
|
||||
*
|
||||
* <p>
|
||||
* In contrast, TermsFilter builds up an {@link FixedBitSet},
|
||||
* keyed by docID, every time it's created, by enumerating
|
||||
* through all matching docs using {@link org.apache.lucene.index.PostingsEnum} to seek
|
||||
|
@ -70,8 +66,7 @@ import org.apache.lucene.util.FixedBitSet;
|
|||
* to the number of terms, which can be exceptionally costly
|
||||
* when there are cache misses in the OS's IO cache.
|
||||
*
|
||||
* <p/>
|
||||
*
|
||||
* <p>
|
||||
* Generally, this filter will be slower on the first
|
||||
* invocation for a given field, but subsequent invocations,
|
||||
* even if you change the allowed set of Terms, should be
|
||||
|
@ -81,8 +76,7 @@ import org.apache.lucene.util.FixedBitSet;
|
|||
* match a very small number of documents, TermsFilter may
|
||||
* perform faster.
|
||||
*
|
||||
* <p/>
|
||||
*
|
||||
* <p>
|
||||
* Which filter is best is very application dependent.
|
||||
*/
|
||||
|
||||
|
|
|
@ -29,8 +29,7 @@ import org.apache.lucene.util.RamUsageEstimator;
|
|||
* that provides on-demand filtering/validation
|
||||
* mechanism on a given DocIdSet.
|
||||
*
|
||||
* <p/>
|
||||
*
|
||||
* <p>
|
||||
* Technically, this same functionality could be achieved
|
||||
* with ChainedFilter (under queries/), however the
|
||||
* benefit of this class is it never materializes the full
|
||||
|
|
|
@ -109,7 +109,7 @@ import org.apache.lucene.index.Term; // for javadocs
* In practice, we have seen up to 300 terms in most cases (index with 500,000 metadata records
* and a uniform value distribution).</p>
*
* <a name="precisionStepDesc"><h3>Precision Step</h3>
* <h3><a name="precisionStepDesc">Precision Step</a></h3>
* <p>You can choose any <code>precisionStep</code> when encoding values.
* Lower step values mean more precision and so more terms in the index (and the index gets larger). The number
* of indexed terms per value is (those are generated by {@link NumericTokenStream}):
@ -123,14 +123,14 @@ import org.apache.lucene.index.Term; // for javadocs
* of the term dictionary in comparison to one term per value:
* <p>
* <!-- the formula in the alt attribute was transformed from latex to PNG with http://1.618034.com/latex.php (with 110 dpi): -->
* <img src="doc-files/nrq-formula-1.png" alt="\mathrm{termDictOverhead} = \sum\limits_{i=0}^{\mathrm{indexedTermsPerValue}-1} \frac{1}{2^{\mathrm{precisionStep}\cdot i}}" />
* <img src="doc-files/nrq-formula-1.png" alt="\mathrm{termDictOverhead} = \sum\limits_{i=0}^{\mathrm{indexedTermsPerValue}-1} \frac{1}{2^{\mathrm{precisionStep}\cdot i}}">
* </p>
* <p>On the other hand, if the <code>precisionStep</code> is smaller, the maximum number of terms to match reduces,
* which optimizes query speed. The formula to calculate the maximum number of terms that will be visited while
* executing the query is:
* <p>
* <!-- the formula in the alt attribute was transformed from latex to PNG with http://1.618034.com/latex.php (with 110 dpi): -->
* <img src="doc-files/nrq-formula-2.png" alt="\mathrm{maxQueryTerms} = \left[ \left( \mathrm{indexedTermsPerValue} - 1 \right) \cdot \left(2^\mathrm{precisionStep} - 1 \right) \cdot 2 \right] + \left( 2^\mathrm{precisionStep} - 1 \right)" />
* <img src="doc-files/nrq-formula-2.png" alt="\mathrm{maxQueryTerms} = \left[ \left( \mathrm{indexedTermsPerValue} - 1 \right) \cdot \left(2^\mathrm{precisionStep} - 1 \right) \cdot 2 \right] + \left( 2^\mathrm{precisionStep} - 1 \right)" >
* </p>
* <p>For longs stored using a precision step of 4, <code>maxQueryTerms = 15*15*2 + 15 = 465</code>, and for a precision
* step of 2, <code>maxQueryTerms = 31*3*2 + 3 = 189</code>. But the faster search speed is reduced by more seeking
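To make the two worked figures above easy to re-check, the helper below simply evaluates the quoted maxQueryTerms formula; it is not part of Lucene, and the 64-bit width and step values are just the ones used in the example:

// indexedTermsPerValue for a full-precision value is ceil(bitsPerValue / precisionStep).
public class MaxQueryTermsCheck {
  static long maxQueryTerms(int bitsPerValue, int precisionStep) {
    int indexedTermsPerValue = (bitsPerValue + precisionStep - 1) / precisionStep;
    long steps = (1L << precisionStep) - 1;
    return (indexedTermsPerValue - 1L) * steps * 2 + steps;
  }

  public static void main(String[] args) {
    System.out.println(maxQueryTerms(64, 4)); // 15*15*2 + 15 = 465
    System.out.println(maxQueryTerms(64, 2)); // 31*3*2 + 3  = 189
  }
}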
@ -34,7 +34,7 @@ import java.util.Arrays;
|
|||
* <p><code>document.add (new Field ("byNumber", Integer.toString(x), Field.Store.NO, Field.Index.NOT_ANALYZED));</code></p>
|
||||
*
|
||||
*
|
||||
* <p><h3>Valid Types of Values</h3>
|
||||
* <h3>Valid Types of Values</h3>
|
||||
*
|
||||
* <p>There are four possible kinds of term values which may be put into
|
||||
* sorting fields: Integers, Longs, Floats, or Strings. Unless
|
||||
|
@ -67,14 +67,14 @@ import java.util.Arrays;
|
|||
* of term value has higher memory requirements than the other
|
||||
* two types.
|
||||
*
|
||||
* <p><h3>Object Reuse</h3>
|
||||
* <h3>Object Reuse</h3>
|
||||
*
|
||||
* <p>One of these objects can be
|
||||
* used multiple times and the sort order changed between usages.
|
||||
*
|
||||
* <p>This class is thread safe.
|
||||
*
|
||||
* <p><h3>Memory Usage</h3>
|
||||
* <h3>Memory Usage</h3>
|
||||
*
|
||||
* <p>Sorting uses of caches of term values maintained by the
|
||||
* internal HitQueue(s). The cache is static and contains an integer
|
||||
|
|
|
@ -35,7 +35,6 @@ import org.apache.lucene.index.SortedNumericDocValues;
* <p>
* Like sorting by string, this also supports sorting missing values as first or last,
* via {@link #setMissingValue(Object)}.
* <p>
* @see SortedNumericSelector
*/
public class SortedNumericSortField extends SortField {
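A brief sketch of the missing-value handling mentioned above, assuming a hypothetical sorted-numeric field named "price" whose value-less documents should sort last:

import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;

public final class MissingLastSort {
  public static Sort byPriceMissingLast() {
    SortField byPrice = new SortedNumericSortField("price", SortField.Type.LONG);
    byPrice.setMissingValue(Long.MAX_VALUE); // documents without a value sort after all real values
    return new Sort(byPrice);
  }
}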
@ -36,7 +36,6 @@ import org.apache.lucene.index.SortedSetDocValues;
|
|||
* <p>
|
||||
* Like sorting by string, this also supports sorting missing values as first or last,
|
||||
* via {@link #setMissingValue(Object)}.
|
||||
* <p>
|
||||
* @see SortedSetSelector
|
||||
*/
|
||||
public class SortedSetSortField extends SortField {
|
||||
|
|
|
@ -24,7 +24,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
/**
|
||||
* Subclass of FilteredTermEnum for enumerating all terms that match the
|
||||
* specified range parameters. Each term in the enumeration is
|
||||
* greater than all that precede it.</p>
|
||||
* greater than all that precede it.
|
||||
*/
|
||||
public class TermRangeTermsEnum extends FilteredTermsEnum {
|
||||
|
||||
|
|
|
@ -94,7 +94,6 @@ public class TimeLimitingCollector implements Collector {
|
|||
* collector.setBaseline(baseline);
|
||||
* indexSearcher.search(query, collector);
|
||||
* </pre>
|
||||
* </p>
|
||||
* @see #setBaseline()
|
||||
*/
|
||||
public void setBaseline(long clockTime) {
|
||||
|
|
|
@ -26,7 +26,7 @@ import org.apache.lucene.util.PriorityQueue;
/**
* A {@link Collector} that sorts by {@link SortField} using
* {@link FieldComparator}s.
* <p/>
* <p>
* See the {@link #create(org.apache.lucene.search.Sort, int, boolean, boolean, boolean)} method
* for instantiating a TopFieldCollector.
*
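A hedged sketch of the create(...) factory referenced in this hunk; the sort field, query and page size are placeholders:

import java.io.IOException;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;

public final class TopFieldCollectorSketch {
  public static TopDocs newestFirst(IndexSearcher searcher) throws IOException {
    Sort byDate = new Sort(new SortField("date", SortField.Type.LONG, /*reverse=*/ true));
    TopFieldCollector collector = TopFieldCollector.create(
        byDate, 10, /*fillFields=*/ true, /*trackDocScores=*/ false, /*trackMaxScore=*/ false);
    searcher.search(new MatchAllDocsQuery(), collector);
    return collector.topDocs();
  }
}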
@ -19,7 +19,6 @@
|
|||
* Code to search indices.
|
||||
*
|
||||
* <h2>Table Of Contents</h2>
|
||||
* <p>
|
||||
* <ol>
|
||||
* <li><a href="#search">Search Basics</a></li>
|
||||
* <li><a href="#query">The Query Classes</a></li>
|
||||
|
@ -28,7 +27,6 @@
|
|||
* <li><a href="#changingScoring">Changing the Scoring</a></li>
|
||||
* <li><a href="#algorithm">Appendix: Search Algorithm</a></li>
|
||||
* </ol>
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* <a name="search"></a>
|
||||
|
@ -40,21 +38,17 @@
* variety of ways to provide complex querying capabilities along with information about where matches took place in the document
* collection. The <a href="#query">Query Classes</a> section below highlights some of the more important Query classes. For details
* on implementing your own Query class, see <a href="#customQueriesExpert">Custom Queries -- Expert Level</a> below.
* </p>
* <p>
* To perform a search, applications usually call {@link
* org.apache.lucene.search.IndexSearcher#search(Query,int)} or {@link
* org.apache.lucene.search.IndexSearcher#search(Query,Filter,int)}.
* </p>
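For orientation, a minimal end-to-end use of the search(Query,int) entry point mentioned above; the index path, field name and term are placeholders for an existing index:

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class BasicSearch {
  public static void main(String[] args) throws Exception {
    try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      Query query = new TermQuery(new Term("fieldName", "term"));
      TopDocs top = searcher.search(query, 10);          // score-sorted top 10 hits
      for (ScoreDoc sd : top.scoreDocs) {
        Document hit = searcher.doc(sd.doc);
        System.out.println(sd.score + "  " + hit);
      }
    }
  }
}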
* <p>
* Once a Query has been created and submitted to the {@link org.apache.lucene.search.IndexSearcher IndexSearcher}, the scoring
* process begins. After some infrastructure setup, control finally passes to the {@link org.apache.lucene.search.Weight Weight}
* implementation and its {@link org.apache.lucene.search.Scorer Scorer} or {@link org.apache.lucene.search.BulkScorer BulkScorer}
* instances. See the <a href="#algorithm">Algorithm</a> section for more notes on the process.
* </p>
* <!-- FILL IN MORE HERE -->
* <!-- TODO: this page over-links the same things too many times -->
* </p>
*
*
* <a name="query"></a>
@ -83,7 +77,6 @@
|
|||
* {@link org.apache.lucene.document.Document Document}s that have the
|
||||
* {@link org.apache.lucene.document.Field Field} named <tt>"fieldName"</tt>
|
||||
* containing the word <tt>"term"</tt>.
|
||||
* </p>
|
||||
* <h3>
|
||||
* {@link org.apache.lucene.search.BooleanQuery BooleanQuery}
|
||||
* </h3>
|
||||
|
@ -123,7 +116,6 @@
|
|||
* The default setting for the maximum number
|
||||
* of clauses is 1024, but this can be changed via the
|
||||
* static method {@link org.apache.lucene.search.BooleanQuery#setMaxClauseCount(int)}.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Phrases</h3>
|
||||
*
|
||||
|
@ -156,7 +148,6 @@
|
|||
* instances.</p>
|
||||
* </li>
|
||||
* </ol>
|
||||
* </p>
|
||||
*
|
||||
* <h3>
|
||||
* {@link org.apache.lucene.search.TermRangeQuery TermRangeQuery}
|
||||
|
@ -174,7 +165,6 @@
|
|||
*
|
||||
* For example, one could find all documents
|
||||
* that have terms beginning with the letters <tt>a</tt> through <tt>c</tt>.
|
||||
* </p>
|
||||
*
|
||||
* <h3>
|
||||
* {@link org.apache.lucene.search.NumericRangeQuery NumericRangeQuery}
|
||||
|
@ -187,7 +177,6 @@
|
|||
* using a one of the numeric fields ({@link org.apache.lucene.document.IntField IntField},
|
||||
* {@link org.apache.lucene.document.LongField LongField}, {@link org.apache.lucene.document.FloatField FloatField},
|
||||
* or {@link org.apache.lucene.document.DoubleField DoubleField}).
|
||||
* </p>
|
||||
*
|
||||
* <h3>
|
||||
* {@link org.apache.lucene.search.PrefixQuery PrefixQuery},
|
||||
|
@ -211,7 +200,6 @@
|
|||
* to remove that protection.
|
||||
* The {@link org.apache.lucene.search.RegexpQuery RegexpQuery} is even more general than WildcardQuery,
|
||||
* allowing an application to identify all documents with terms that match a regular expression pattern.
|
||||
* </p>
|
||||
* <h3>
|
||||
* {@link org.apache.lucene.search.FuzzyQuery FuzzyQuery}
|
||||
* </h3>
|
||||
|
@ -222,7 +210,6 @@
|
|||
* determined using
|
||||
* <a href="http://en.wikipedia.org/wiki/Levenshtein">Levenshtein (edit) distance</a>.
|
||||
* This type of query can be useful when accounting for spelling variations in the collection.
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* <a name="scoring"></a>
|
||||
|
@ -234,10 +221,8 @@
|
|||
* <a href="mailto:java-user@lucene.apache.org">java-user@lucene.apache.org</a> to figure out
|
||||
* why a document with five of our query terms scores lower than a different document with
|
||||
* only one of the query terms.
|
||||
* </p>
|
||||
* <p>While this document won't answer your specific scoring issues, it will, hopefully, point you
|
||||
* to the places that can help you figure out the <i>what</i> and <i>why</i> of Lucene scoring.
|
||||
* </p>
|
||||
* <p>Lucene scoring supports a number of pluggable information retrieval
|
||||
* <a href="http://en.wikipedia.org/wiki/Information_retrieval#Model_types">models</a>, including:
|
||||
* <ul>
|
||||
|
@ -252,14 +237,12 @@
|
|||
* that need to be scored based on boolean logic in the Query specification, and then ranks this subset of
|
||||
* matching documents via the retrieval model. For some valuable references on VSM and IR in general refer to
|
||||
* <a href="http://wiki.apache.org/lucene-java/InformationRetrieval">Lucene Wiki IR references</a>.
|
||||
* </p>
|
||||
* <p>The rest of this document will cover <a href="#scoringBasics">Scoring basics</a> and explain how to
|
||||
* change your {@link org.apache.lucene.search.similarities.Similarity Similarity}. Next, it will cover
|
||||
* ways you can customize the lucene internals in
|
||||
* <a href="#customQueriesExpert">Custom Queries -- Expert Level</a>, which gives details on
|
||||
* implementing your own {@link org.apache.lucene.search.Query Query} class and related functionality.
|
||||
* Finally, we will finish up with some reference material in the <a href="#algorithm">Appendix</a>.
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* <a name="scoringBasics"></a>
|
||||
|
@ -286,7 +269,6 @@
|
|||
* important because two Documents with the exact same content, but one having the content in two
|
||||
* Fields and the other in one Field may return different scores for the same query due to length
|
||||
* normalization.
|
||||
* </p>
|
||||
* <h3>Score Boosting</h3>
|
||||
* <p>Lucene allows influencing search results by "boosting" at different times:
|
||||
* <ul>
|
||||
|
@ -296,7 +278,6 @@
|
|||
* <li><b>Query-time boost</b> by setting a boost on a query clause, calling
|
||||
* {@link org.apache.lucene.search.Query#setBoost(float) Query.setBoost()}.</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
* <p>Indexing time boosts are pre-processed for storage efficiency and written to
|
||||
* storage for a field as follows:
|
||||
* <ul>
|
||||
|
@ -310,8 +291,6 @@
|
|||
* <li>Decoding of any index-time normalization values and integration into the document's score is also performed
|
||||
* at search time by the Similarity.</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* <a name="changingScoring"></a>
|
||||
* <h2>Changing Scoring — Similarity</h2>
|
||||
|
@ -324,22 +303,18 @@
* IndexSearcher.setSimilarity(Similarity)}. Be sure to use the same
* Similarity at query-time as at index-time (so that norms are
* encoded/decoded correctly); Lucene makes no effort to verify this.
* </p>
* <p>
* You can influence scoring by configuring a different built-in Similarity implementation, or by tweaking its
* parameters, subclassing it to override behavior. Some implementations also offer a modular API which you can
* extend by plugging in a different component (e.g. term frequency normalizer).
* </p>
* <p>
* Finally, you can extend the low level {@link org.apache.lucene.search.similarities.Similarity Similarity} directly
* to implement a new retrieval model, or to use external scoring factors particular to your application. For example,
* a custom Similarity can access per-document values via {@link org.apache.lucene.index.NumericDocValues} and
* integrate them into the score.
* </p>
* <p>
* See the {@link org.apache.lucene.search.similarities} package documentation for information
* on the built-in available scoring models and extending or changing Similarity.
* </p>
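A compact sketch of the index-time / query-time pairing the passage insists on, using the built-in BM25Similarity purely as an example (any Similarity would do, and the helper class is illustrative):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;

public final class SimilaritySetup {
  private static final Similarity SIM = new BM25Similarity();

  // Use the same Similarity when norms are written at index time...
  public static IndexWriterConfig writerConfig() {
    return new IndexWriterConfig(new StandardAnalyzer()).setSimilarity(SIM);
  }

  // ...and again when documents are scored at query time.
  public static IndexSearcher searcher(IndexReader reader) {
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(SIM);
    return searcher;
  }
}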
*
*
* <a name="customQueriesExpert"></a>
@ -347,7 +322,6 @@
|
|||
*
|
||||
* <p>Custom queries are an expert level task, so tread carefully and be prepared to share your code if
|
||||
* you want help.
|
||||
* </p>
|
||||
*
|
||||
* <p>With the warning out of the way, it is possible to change a lot more than just the Similarity
|
||||
* when it comes to matching and scoring in Lucene. Lucene's search is a complex mechanism that is grounded by
|
||||
|
@ -374,7 +348,6 @@
|
|||
* implementations.</li>
|
||||
* </ol>
|
||||
* Details on each of these classes, and their children, can be found in the subsections below.
|
||||
* </p>
|
||||
* <h3>The Query Class</h3>
|
||||
* <p>In some sense, the
|
||||
* {@link org.apache.lucene.search.Query Query}
|
||||
|
@ -396,7 +369,6 @@
|
|||
* {@link org.apache.lucene.search.BooleanQuery BooleanQuery}, <span
|
||||
* >and other queries that implement {@link org.apache.lucene.search.Query#createWeight(IndexSearcher,boolean) createWeight(IndexSearcher searcher,boolean)}</span></li>
|
||||
* </ol>
|
||||
* </p>
|
||||
* <a name="weightClass"></a>
|
||||
* <h3>The Weight Interface</h3>
|
||||
* <p>The
|
||||
|
@ -449,10 +421,8 @@
|
|||
* Typically a weight such as TermWeight
|
||||
* that scores via a {@link org.apache.lucene.search.similarities.Similarity Similarity} will make use of the Similarity's implementation:
|
||||
* {@link org.apache.lucene.search.similarities.Similarity.SimScorer#explain(int, Explanation) SimScorer#explain(int doc, Explanation freq)}.
|
||||
* </li>
|
||||
* </li>
|
||||
* </li>
|
||||
* </ol>
|
||||
* </p>
|
||||
* <a name="scorerClass"></a>
|
||||
* <h3>The Scorer Class</h3>
|
||||
* <p>The
|
||||
|
@ -494,7 +464,6 @@
|
|||
* details on the scoring process.
|
||||
* </li>
|
||||
* </ol>
|
||||
* </p>
|
||||
* <a name="bulkScorerClass"></a>
|
||||
* <h3>The BulkScorer Class</h3>
|
||||
* <p>The
|
||||
|
@ -506,14 +475,13 @@
|
|||
* Score all documents up to but not including the specified max document.
|
||||
* </li>
|
||||
* </ol>
|
||||
* </p>
|
||||
* <h3>Why would I want to add my own Query?</h3>
|
||||
*
|
||||
* <p>In a nutshell, you want to add your own custom Query implementation when you think that Lucene's
|
||||
* aren't appropriate for the
|
||||
* task that you want to do. You might be doing some cutting edge research or you need more information
|
||||
* back
|
||||
* out of Lucene (similar to Doug adding SpanQuery functionality).</p>
|
||||
* out of Lucene (similar to Doug adding SpanQuery functionality).
|
||||
*
|
||||
* <!-- TODO: integrate this better, it's better served as an intro than an appendix -->
|
||||
*
|
||||
|
@ -521,10 +489,10 @@
|
|||
* <a name="algorithm"></a>
|
||||
* <h2>Appendix: Search Algorithm</h2>
|
||||
* <p>This section is mostly notes on stepping through the Scoring process and serves as
|
||||
* fertilizer for the earlier sections.</p>
|
||||
* fertilizer for the earlier sections.
|
||||
* <p>In the typical search application, a {@link org.apache.lucene.search.Query Query}
|
||||
* is passed to the {@link org.apache.lucene.search.IndexSearcher IndexSearcher},
|
||||
* beginning the scoring process.</p>
|
||||
* beginning the scoring process.
|
||||
* <p>Once inside the IndexSearcher, a {@link org.apache.lucene.search.Collector Collector}
|
||||
* is used for the scoring and sorting of the search results.
|
||||
* These important objects are involved in a search:
|
||||
|
@ -538,7 +506,6 @@
|
|||
* <li>A {@link org.apache.lucene.search.Sort Sort} object for specifying how to sort
|
||||
* the results if the standard score-based sort method is not desired.</li>
|
||||
* </ol>
|
||||
* </p>
|
||||
* <p>Assuming we are not sorting (since sorting doesn't affect the raw Lucene score),
|
||||
* we call one of the search methods of the IndexSearcher, passing in the
|
||||
* {@link org.apache.lucene.search.Weight Weight} object created by
|
||||
|
@ -553,12 +520,10 @@
|
|||
* see {@link org.apache.lucene.search.IndexSearcher IndexSearcher}). The TopScoreDocCollector
|
||||
* uses a {@link org.apache.lucene.util.PriorityQueue PriorityQueue} to collect the
|
||||
* top results for the search.
|
||||
* </p>
|
||||
* <p>If a Filter is being used, some initial setup is done to determine which docs to include.
|
||||
* Otherwise, we ask the Weight for a {@link org.apache.lucene.search.Scorer Scorer} for each
|
||||
* {@link org.apache.lucene.index.IndexReader IndexReader} segment and proceed by calling
|
||||
* {@link org.apache.lucene.search.BulkScorer#score(org.apache.lucene.search.LeafCollector) BulkScorer.score(LeafCollector)}.
|
||||
* </p>
|
||||
* <p>At last, we are actually going to score some documents. The score method takes in the Collector
|
||||
* (most likely the TopScoreDocCollector or TopFieldCollector) and does its business. Of course, here
|
||||
* is where things get involved. The {@link org.apache.lucene.search.Scorer Scorer} that is returned
|
||||
|
@ -567,13 +532,12 @@
|
|||
* {@link org.apache.lucene.search.Scorer Scorer} is going to be a <code>BooleanScorer2</code> created
|
||||
* from {@link org.apache.lucene.search.BooleanWeight BooleanWeight} (see the section on
|
||||
* <a href="#customQueriesExpert">custom queries</a> for info on changing this).
|
||||
* </p>
|
||||
* <p>Assuming a BooleanScorer2, we first initialize the Coordinator, which is used to apply the coord()
|
||||
* factor. We then get a internal Scorer based on the required, optional and prohibited parts of the query.
|
||||
* Using this internal Scorer, the BooleanScorer2 then proceeds into a while loop based on the
|
||||
* {@link org.apache.lucene.search.Scorer#nextDoc Scorer.nextDoc()} method. The nextDoc() method advances
|
||||
* to the next document matching the query. This is an abstract method in the Scorer class and is thus
|
||||
* overridden by all derived implementations. If you have a simple OR query your internal Scorer is most
|
||||
* likely a DisjunctionSumScorer, which essentially combines the scorers from the sub scorers of the OR'd terms.</p>
|
||||
* likely a DisjunctionSumScorer, which essentially combines the scorers from the sub scorers of the OR'd terms.
|
||||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
|
|
@ -20,7 +20,7 @@ package org.apache.lucene.search.payloads;
|
|||
|
||||
/**
|
||||
* Calculate the final score as the average score of all payloads seen.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Is thread safe and completely reusable.
|
||||
*
|
||||
**/
|
||||
|
|
|
@ -20,7 +20,7 @@ package org.apache.lucene.search.payloads;
|
|||
|
||||
/**
|
||||
* Returns the maximum payload score seen, else 1 if there are no payloads on the doc.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Is thread safe and completely reusable.
|
||||
*
|
||||
**/