diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
index 280033e45ec..603df1349ba 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
@@ -31,11 +31,12 @@ import java.util.HashSet;
import java.util.Set;
/**
- * Analyzer for Czech language. Supports an external list of stopwords (words that
- * will not be indexed at all).
- * A default set of stopwords is used unless an alternative list is specified, the
- * exclusion list is empty by default.
- *
+ * {@link Analyzer} for the Czech language.
+ *
+ * Supports an external list of stopwords (words that
+ * will not be indexed at all).
+ * A default set of stopwords is used unless an alternative list is specified.
+ *
*/
public final class CzechAnalyzer extends Analyzer {
@@ -64,7 +65,7 @@ public final class CzechAnalyzer extends Analyzer {
};
/**
- * Contains the stopwords used with the StopFilter.
+ * Contains the stopwords used with the {@link StopFilter}.
*/
private Set stoptable;
@@ -125,10 +126,10 @@ public final class CzechAnalyzer extends Analyzer {
}
/**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
*
- * @return A TokenStream built from a StandardTokenizer filtered with
- * StandardFilter, LowerCaseFilter, and StopFilter
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+ * {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
*/
public final TokenStream tokenStream( String fieldName, Reader reader ) {
TokenStream result = new StandardTokenizer( reader );
@@ -144,11 +145,11 @@ public final class CzechAnalyzer extends Analyzer {
};
/**
- * Returns a (possibly reused) TokenStream which tokenizes all the text in
- * the provided Reader.
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in
+ * the provided {@link Reader}.
*
- * @return A TokenStream built from a StandardTokenizer filtered with
- * StandardFilter, LowerCaseFilter, and StopFilter
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+ * {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
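A minimal consumption sketch for the chain documented above (StandardTokenizer, StandardFilter, LowerCaseFilter, StopFilter), mirroring the incrementToken()/TermAttribute pattern used by the tests later in this patch; the field name and sample text are illustrative only and the default constructor is assumed.

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cz.CzechAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class CzechAnalyzerSketch {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new CzechAnalyzer();
        // tokenStream() builds StandardTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter
        TokenStream ts = analyzer.tokenStream("body", new StringReader("Lucene je knihovna pro fulltext"));
        TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(termAtt.term());
        }
      }
    }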
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
index b7d5a20a0f0..bf45a967507 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@@ -35,12 +35,14 @@ import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
- * Analyzer for German language. Supports an external list of stopwords (words that
+ * {@link Analyzer} for the German language.
+ *
+ * Supports an external list of stopwords (words that
* will not be indexed at all) and an external list of exclusions (word that will
* not be stemmed, but indexed).
- * A default set of stopwords is used unless an alternative list is specified, the
+ * A default set of stopwords is used unless an alternative list is specified, but the
* exclusion list is empty by default.
- *
+ *
*
* @version $Id$
*/
@@ -65,7 +67,7 @@ public class GermanAnalyzer extends Analyzer {
};
/**
- * Contains the stopwords used with the StopFilter.
+ * Contains the stopwords used with the {@link StopFilter}.
*/
private Set stopSet = new HashSet();
@@ -75,8 +77,8 @@ public class GermanAnalyzer extends Analyzer {
private Set exclusionSet = new HashSet();
/**
- * Builds an analyzer with the default stop words
- * (GERMAN_STOP_WORDS
).
+ * Builds an analyzer with the default stop words:
+ * {@link #GERMAN_STOP_WORDS}.
*/
public GermanAnalyzer() {
stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
@@ -115,7 +117,7 @@ public class GermanAnalyzer extends Analyzer {
}
/**
- * Builds an exclusionlist from a Hashtable.
+ * Builds an exclusion list from a {@link Map}.
*/
public void setStemExclusionTable(Map exclusionlist) {
exclusionSet = new HashSet(exclusionlist.keySet());
@@ -129,10 +131,11 @@ public class GermanAnalyzer extends Analyzer {
}
/**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
*
- * @return A TokenStream built from a StandardTokenizer filtered with
- * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+ * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
+ * {@link GermanStemFilter}
*/
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(reader);
@@ -149,11 +152,12 @@ public class GermanAnalyzer extends Analyzer {
};
/**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
- * in the provided Reader.
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+ * in the provided {@link Reader}.
*
- * @return A TokenStream built from a StandardTokenizer filtered with
- * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+ * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
+ * {@link GermanStemFilter}
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
if (overridesTokenStreamMethod) {
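A short sketch of the {@link Map}-based exclusion setter documented above. Per the hunk above, only the map keys are consulted (exclusionlist.keySet() is copied), so the value is irrelevant; the excluded word is an arbitrary example.

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.analysis.de.GermanAnalyzer;

    public class GermanExclusionSketch {
      public static void main(String[] args) {
        GermanAnalyzer analyzer = new GermanAnalyzer();  // uses the default GERMAN_STOP_WORDS
        Map exclusions = new HashMap();
        exclusions.put("Autobahnen", null);              // only the keys matter: keySet() is copied
        analyzer.setStemExclusionTable(exclusions);      // "Autobahnen" will be indexed unstemmed
      }
    }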
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
index 1929563ed71..c142965925d 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
@@ -25,10 +25,12 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
- * A filter that stems German words. It supports a table of words that should
+ * A {@link TokenFilter} that stems German words.
+ *
+ * It supports a table of words that should
* not be stemmed at all. The stemmer used can be changed at runtime after the
- * filter object is created (as long as it is a GermanStemmer).
- *
+ * filter object is created (as long as it is a {@link GermanStemmer}).
+ *
*
* @version $Id$
*/
@@ -78,7 +80,7 @@ public final class GermanStemFilter extends TokenFilter
}
/**
- * Set a alternative/custom GermanStemmer for this filter.
+ * Set an alternative/custom {@link GermanStemmer} for this filter.
*/
public void setStemmer( GermanStemmer stemmer )
{
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemmer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
index fea67129196..1cdcb41ee47 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
@@ -19,10 +19,12 @@ package org.apache.lucene.analysis.de;
*/
/**
- * A stemmer for German words. The algorithm is based on the report
+ * A stemmer for German words.
+ *
+ * The algorithm is based on the report
* "A Fast and Simple Stemming Algorithm for German Words" by Jörg
* Caumanns (joerg.caumanns at isst.fhg.de).
- *
+ *
*
* @version $Id$
*/
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
index a77f5f80224..4de196e79d6 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
@@ -30,10 +30,12 @@ import java.util.Map;
import java.util.Set;
/**
- * Analyzer for the Greek language. Supports an external list of stopwords (words
+ * {@link Analyzer} for the Greek language.
+ *
+ * Supports an external list of stopwords (words
* that will not be indexed at all).
* A default set of stopwords is used unless an alternative list is specified.
- *
+ *
*/
public final class GreekAnalyzer extends Analyzer
{
@@ -145,14 +147,14 @@ public final class GreekAnalyzer extends Analyzer
};
/**
- * Contains the stopwords used with the StopFilter.
+ * Contains the stopwords used with the {@link StopFilter}.
*/
private Set stopSet = new HashSet();
/**
* Charset for Greek letters.
* Represents encoding for 24 lowercase Greek letters.
- * Predefined charsets can be taken from GreekCharSets class
+ * Predefined charsets can be taken from the {@link GreekCharsets} class
*/
private char[] charset;
@@ -209,10 +211,10 @@ public final class GreekAnalyzer extends Analyzer
}
/**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
*
- * @return A TokenStream built from a StandardTokenizer filtered with
- * GreekLowerCaseFilter and StopFilter
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+ * {@link GreekLowerCaseFilter} and {@link StopFilter}
*/
public TokenStream tokenStream(String fieldName, Reader reader)
{
@@ -228,11 +230,11 @@ public final class GreekAnalyzer extends Analyzer
};
/**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
- * in the provided Reader.
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+ * in the provided {@link Reader}.
*
- * @return A TokenStream built from a StandardTokenizer filtered with
- * GreekLowerCaseFilter and StopFilter
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+ * {@link GreekLowerCaseFilter} and {@link StopFilter}
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
index 061dbc2cb9d..80470976fd6 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
@@ -19,10 +19,11 @@ package org.apache.lucene.analysis.el;
/**
* GreekCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
* for greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253.
+ *
* Each encoding scheme contains lowercase (positions 0-35) and uppercase (position 36-68) characters,
* including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
* the definition of a new charset as well as the required logic in the toLowerCase() method.
- *
+ *
*/
public class GreekCharsets
{
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
index 4c212768ebc..65e463d38ca 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
@@ -36,12 +36,12 @@ import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
/**
- * Analyzer for Persian.
- *
- * Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around
- * ZWNJ in addition to space. Some persian-specific variant forms (such as farsi
+ * {@link Analyzer} for Persian.
+ *
+ * This Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around
+ * zero-width non-joiner in addition to whitespace. Some Persian-specific variant forms (such as Farsi
* yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
- *
+ *
*/
public final class PersianAnalyzer extends Analyzer {
@@ -107,11 +107,13 @@ public final class PersianAnalyzer extends Analyzer {
}
/**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * {@link Reader}.
*
- * @return A TokenStream build from a ArabicLetterTokenizer filtered with
- * LowerCaseFilter, ArabicNormalizationFilter,
- * PersianNormalizationFilter and Persian Stop words
+ * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer}
+ * filtered with {@link LowerCaseFilter},
+ * {@link ArabicNormalizationFilter},
+ * {@link PersianNormalizationFilter} and Persian stop words
*/
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new ArabicLetterTokenizer(reader);
@@ -134,12 +136,13 @@ public final class PersianAnalyzer extends Analyzer {
}
/**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
- * in the provided Reader.
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+ * in the provided {@link Reader}.
*
- * @return A TokenStream build from a ArabicLetterTokenizer filtered with
- * LowerCaseFilter, ArabicNormalizationFilter,
- * PersianNormalizationFilter and Persian Stop words
+ * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer}
+ * filtered with {@link LowerCaseFilter},
+ * {@link ArabicNormalizationFilter},
+ * {@link PersianNormalizationFilter} and Persian stop words
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
index 1106cca4c49..e9eb308a4b4 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
- * A TokenFilter that applies {@link PersianNormalizer} to normalize the
+ * A {@link TokenFilter} that applies {@link PersianNormalizer} to normalize the
* orthography.
*
*/
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
index b354e4dfdb1..82e8b3f3449 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
@@ -22,16 +22,17 @@ import java.util.Set;
import java.util.HashSet;
import java.util.Arrays;
import java.util.Iterator;
+import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
- * Removes elisions from a token stream. For example, "l'avion" (the plane) will be
+ * Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
* tokenized as "avion" (plane).
*
- * Note that StandardTokenizer sees " ' " as a space, and cuts it out.
+ * Note that {@link StandardTokenizer} sees " ' " as a space, and cuts it out.
*
* @see Elision in Wikipedia
*/
@@ -78,7 +79,7 @@ public class ElisionFilter extends TokenFilter {
}
/**
- * Returns the next input Token with term() without elisioned start
+ * Increments the {@link TokenStream} with a {@link TermAttribute} without the elided start
*/
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
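A hedged usage sketch of the filter, reproducing the "l'avion" to "avion" example from the javadoc above; it assumes a single-argument constructor that falls back to a default set of French articles, which this patch does not show.

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.fr.ElisionFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ElisionSketch {
      public static void main(String[] args) throws Exception {
        // assumed constructor: ElisionFilter(TokenStream) with the default article set
        TokenStream ts = new ElisionFilter(new StandardTokenizer(new StringReader("l'avion")));
        TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(termAtt.term());  // per the javadoc above: "avion"
        }
      }
    }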
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
index fbafa35a14a..048e8e7f06b 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
@@ -34,12 +34,14 @@ import java.util.Map;
import java.util.Set;
/**
- * Analyzer for French language. Supports an external list of stopwords (words that
+ * {@link Analyzer} for the French language.
+ *
+ * Supports an external list of stopwords (words that
* will not be indexed at all) and an external list of exclusions (word that will
* not be stemmed, but indexed).
- * A default set of stopwords is used unless an alternative list is specified, the
+ * A default set of stopwords is used unless an alternative list is specified, but the
* exclusion list is empty by default.
- *
+ *
*
* @version $Id$
*/
@@ -74,7 +76,7 @@ public final class FrenchAnalyzer extends Analyzer {
};
/**
- * Contains the stopwords used with the StopFilter.
+ * Contains the stopwords used with the {@link StopFilter}.
*/
private Set stoptable = new HashSet();
/**
@@ -127,10 +129,12 @@ public final class FrenchAnalyzer extends Analyzer {
}
/**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * {@link Reader}.
*
- * @return A TokenStream built from a StandardTokenizer filtered with
- * StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * {@link FrenchStemFilter} and {@link LowerCaseFilter}
*/
public final TokenStream tokenStream(String fieldName, Reader reader) {
@@ -152,11 +156,12 @@ public final class FrenchAnalyzer extends Analyzer {
};
/**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
- * in the provided Reader.
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
+ * text in the provided {@link Reader}.
*
- * @return A TokenStream built from a StandardTokenizer filtered with
- * StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * {@link FrenchStemFilter} and {@link LowerCaseFilter}
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
index 991c4ec1e5f..8ea51c510f6 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.fr;
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -28,10 +27,12 @@ import java.util.Map;
import java.util.Set;
/**
- * A filter that stemms french words. It supports a table of words that should
+ * A {@link TokenFilter} that stems french words.
+ *
+ * It supports a table of words that should
* not be stemmed at all. The used stemmer can be changed at runtime after the
- * filter object is created (as long as it is a FrenchStemmer).
- *
+ * filter object is created (as long as it is a {@link FrenchStemmer}).
+ *
*/
public final class FrenchStemFilter extends TokenFilter {
@@ -75,7 +76,7 @@ public final class FrenchStemFilter extends TokenFilter {
}
}
/**
- * Set a alternative/custom FrenchStemmer for this filter.
+ * Set an alternative/custom {@link FrenchStemmer} for this filter.
*/
public void setStemmer( FrenchStemmer stemmer ) {
if ( stemmer != null ) {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java
index 383a14b4de3..8dc6bf5ccf6 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java
@@ -18,11 +18,13 @@ package org.apache.lucene.analysis.fr;
*/
/**
- * A stemmer for French words. The algorithm is based on the work of
+ * A stemmer for French words.
+ *
+ * The algorithm is based on the work of
* Dr Martin Porter on his snowball project
* refer to http://snowball.sourceforge.net/french/stemmer.html
* (French stemming algorithm) for details
- *
+ *
*/
public class FrenchStemmer {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
index 3b2c3779622..c411c2319d1 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
/**
- * Links two PrefixAwareTokenFilter
+ * Links two {@link PrefixAwareTokenFilter} instances.
*
* NOTE: This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
* the ones located in org.apache.lucene.analysis.tokenattributes.
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
index 51c824c471c..7205f46c10f 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
@@ -29,7 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
- * A token stream containing a single token.
+ * A {@link TokenStream} containing a single token.
*/
public class SingleTokenTokenStream extends TokenStream {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/package.html b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/package.html
new file mode 100644
index 00000000000..1356a6e771d
--- /dev/null
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/package.html
@@ -0,0 +1,5 @@
+
+
+Miscellaneous TokenStreams
+
+
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index b0fca28d82e..a1d1b76261d 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -27,9 +27,9 @@ import java.io.IOException;
/**
* Tokenizes the given token into n-grams of given size(s).
- *
- * This filter create n-grams from the beginning edge or ending edge of a input token.
- *
+ *
+ * This {@link TokenFilter} creates n-grams from the beginning edge or ending edge of an input token.
+ *
*/
public class EdgeNGramTokenFilter extends TokenFilter {
public static final Side DEFAULT_SIDE = Side.FRONT;
@@ -84,7 +84,7 @@ public class EdgeNGramTokenFilter extends TokenFilter {
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
*
- * @param input TokenStream holding the input to be tokenized
+ * @param input {@link TokenStream} holding the input to be tokenized
* @param side the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
@@ -114,7 +114,7 @@ public class EdgeNGramTokenFilter extends TokenFilter {
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
*
- * @param input TokenStream holding the input to be tokenized
+ * @param input {@link TokenStream} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
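A sketch of the constructor documented above, driven by a core WhitespaceTokenizer; it assumes Side.FRONT is reachable as EdgeNGramTokenFilter.Side.FRONT (consistent with the DEFAULT_SIDE field shown earlier in this file), and the sample word is arbitrary.

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class EdgeNGramSketch {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new EdgeNGramTokenFilter(
            new WhitespaceTokenizer(new StringReader("apache")),
            EdgeNGramTokenFilter.Side.FRONT, 1, 3);
        TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(termAtt.term());  // front edge grams: "a", "ap", "apa"
        }
      }
    }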
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
index 91579094bb5..3c13c07a8ff 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ngram;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -28,10 +27,10 @@ import java.io.Reader;
/**
* Tokenizes the input from an edge into n-grams of given size(s).
- *
- * This tokenizer create n-grams from the beginning edge or ending edge of a input token.
+ *
+ * This {@link Tokenizer} creates n-grams from the beginning edge or ending edge of an input token.
* MaxGram can't be larger than 1024 because of limitation.
- *
+ *
*/
public class EdgeNGramTokenizer extends Tokenizer {
public static final Side DEFAULT_SIDE = Side.FRONT;
@@ -82,7 +81,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
- * @param input Reader holding the input to be tokenized
+ * @param input {@link Reader} holding the input to be tokenized
* @param side the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
@@ -112,7 +111,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
- * @param input Reader holding the input to be tokenized
+ * @param input {@link Reader} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
index 46db5ce3670..e0d849b2c2f 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
@@ -44,7 +44,7 @@ public class NGramTokenFilter extends TokenFilter {
/**
* Creates NGramTokenFilter with given min and max n-grams.
- * @param input TokenStream holding the input to be tokenized
+ * @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
@@ -65,7 +65,7 @@ public class NGramTokenFilter extends TokenFilter {
/**
* Creates NGramTokenFilter with default min and max n-grams.
- * @param input TokenStream holding the input to be tokenized
+ * @param input {@link TokenStream} holding the input to be tokenized
*/
public NGramTokenFilter(TokenStream input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
index 974bea64771..89dff1ae262 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
@@ -44,7 +44,7 @@ public class NGramTokenizer extends Tokenizer {
/**
* Creates NGramTokenizer with given min and max n-grams.
- * @param input Reader holding the input to be tokenized
+ * @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
@@ -64,7 +64,7 @@ public class NGramTokenizer extends Tokenizer {
}
/**
* Creates NGramTokenizer with default min and max n-grams.
- * @param input Reader holding the input to be tokenized
+ * @param input {@link Reader} holding the input to be tokenized
*/
public NGramTokenizer(Reader input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
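A small sketch of the (Reader, minGram, maxGram) constructor documented above; the input string is arbitrary and the comment shows the expected character bigrams for it.

    import java.io.StringReader;
    import org.apache.lucene.analysis.ngram.NGramTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class NGramSketch {
      public static void main(String[] args) throws Exception {
        NGramTokenizer tok = new NGramTokenizer(new StringReader("abcde"), 2, 2);
        TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
        while (tok.incrementToken()) {
          System.out.println(termAtt.term());  // character bigrams: "ab", "bc", "cd", "de"
        }
      }
    }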
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/package.html b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/package.html
new file mode 100644
index 00000000000..683197879fc
--- /dev/null
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/package.html
@@ -0,0 +1,5 @@
+
+
+Character n-gram tokenizers and filters.
+
+
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
index dae58534962..081fbcf9898 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
@@ -33,13 +33,15 @@ import java.util.Set;
import java.util.Map;
/**
- * Analyzer for Dutch language. Supports an external list of stopwords (words that
+ * {@link Analyzer} for the Dutch language.
+ *
+ * Supports an external list of stopwords (words that
* will not be indexed at all), an external list of exclusions (word that will
* not be stemmed, but indexed) and an external list of word-stem pairs that overrule
* the algorithm (dictionary stemming).
- * A default set of stopwords is used unless an alternative list is specified, the
+ * A default set of stopwords is used unless an alternative list is specified, but the
* exclusion list is empty by default.
- *
+ *
*/
public class DutchAnalyzer extends Analyzer {
/**
@@ -165,10 +167,12 @@ public class DutchAnalyzer extends Analyzer {
}
/**
- * Creates a TokenStream which tokenizes all the text in the provided TextReader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the
+ * provided {@link Reader}.
*
- * @return A TokenStream built from a StandardTokenizer filtered with StandardFilter,
- * StopFilter, DutchStemFilter
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * and {@link DutchStemFilter}
*/
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(reader);
@@ -184,11 +188,12 @@ public class DutchAnalyzer extends Analyzer {
};
/**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
- * in the provided Reader.
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
+ * text in the provided {@link Reader}.
*
- * @return A TokenStream built from a StandardTokenizer filtered with
- * StandardFilter, StopFilter, DutchStemFilter
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * and {@link DutchStemFilter}
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
index 037ee028011..4f9ae66fb4c 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
@@ -28,10 +28,12 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
- * A filter that stems Dutch words. It supports a table of words that should
+ * A {@link TokenFilter} that stems Dutch words.
+ *
+ * It supports a table of words that should
* not be stemmed at all. The stemmer used can be changed at runtime after the
- * filter object is created (as long as it is a DutchStemmer).
- *
+ * filter object is created (as long as it is a {@link DutchStemmer}).
+ *
*/
public final class DutchStemFilter extends TokenFilter {
/**
@@ -85,7 +87,7 @@ public final class DutchStemFilter extends TokenFilter {
}
/**
- * Set a alternative/custom DutchStemmer for this filter.
+ * Set an alternative/custom {@link DutchStemmer} for this filter.
*/
public void setStemmer(DutchStemmer stemmer) {
if (stemmer != null) {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java
index 9b70ab6eb01..84f35f0c66e 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java
@@ -20,11 +20,12 @@ package org.apache.lucene.analysis.nl;
import java.util.Map;
/**
- *
- * A stemmer for Dutch words. The algorithm is an implementation of
+ * A stemmer for Dutch words.
+ *
+ * The algorithm is an implementation of
* the dutch stemming
* algorithm in Martin Porter's snowball project.
- *
+ *
*/
public class DutchStemmer {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/AbstractEncoder.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/AbstractEncoder.java
index 6b0533bae01..a4bdad3ca82 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/AbstractEncoder.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/AbstractEncoder.java
@@ -4,7 +4,7 @@ import org.apache.lucene.index.Payload;
/**
- *
+ * Base class for payload encoders.
*
**/
public abstract class AbstractEncoder implements PayloadEncoder{
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/FloatEncoder.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/FloatEncoder.java
index fd7c0bd2c63..2dd8d831aba 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/FloatEncoder.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/FloatEncoder.java
@@ -22,7 +22,7 @@ import org.apache.lucene.index.Payload;
/**
* Encode a character array Float as a {@link org.apache.lucene.index.Payload}.
*
- * @see {@link org.apache.lucene.analysis.payloads.PayloadHelper#encodeFloat(float, byte[], int)}
+ * @see org.apache.lucene.analysis.payloads.PayloadHelper#encodeFloat(float, byte[], int)
*
**/
public class FloatEncoder extends AbstractEncoder implements PayloadEncoder {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/PayloadEncoder.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/PayloadEncoder.java
index d88cc1c2e08..ebcdec40ffd 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/PayloadEncoder.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/PayloadEncoder.java
@@ -20,7 +20,7 @@ import org.apache.lucene.index.Payload;
/**
- * Mainly for use with the DelimitedPayloadTokenFilter, converts char buffers to Payload
+ * Mainly for use with the DelimitedPayloadTokenFilter, converts char buffers to Payload.
*
* NOTE: This interface is subject to change
*
@@ -34,7 +34,7 @@ public interface PayloadEncoder {
* @param buffer
* @param offset
* @param length
- * @return
+ * @return encoded {@link Payload}
*/
Payload encode(char [] buffer, int offset, int length);
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/PayloadHelper.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/PayloadHelper.java
index 31684d567c6..1c8a2f841cc 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/PayloadHelper.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/PayloadHelper.java
@@ -18,7 +18,7 @@ package org.apache.lucene.analysis.payloads;
/**
- *
+ * Utility methods for encoding payloads.
*
**/
public class PayloadHelper {
@@ -60,7 +60,7 @@ public class PayloadHelper {
* @param offset The offset into the array.
* @return The float that was encoded
*
- * @see # encodeFloat (float)
+ * @see #encodeFloat(float)
*/
public static final float decodeFloat(byte [] bytes, int offset){
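A round-trip sketch for the helpers referenced above; decodeFloat(byte[], int) appears in this hunk, while encodeFloat(float) returning a byte[] is assumed from the @see reference.

    import org.apache.lucene.analysis.payloads.PayloadHelper;

    public class PayloadFloatSketch {
      public static void main(String[] args) {
        byte[] bytes = PayloadHelper.encodeFloat(3.14f);    // encode a float into a 4-byte payload
        float value = PayloadHelper.decodeFloat(bytes, 0);  // decode starting at offset 0
        System.out.println(value);                          // 3.14
      }
    }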
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/package.html b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/package.html
new file mode 100644
index 00000000000..1f1457246ee
--- /dev/null
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/package.html
@@ -0,0 +1,5 @@
+
+
+Filter for assigning position increments.
+
+
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
index 1f3aea5a0c1..2a284dee557 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
@@ -28,19 +28,19 @@ import java.io.IOException;
import java.io.Reader;
import java.util.*;
-/*
- * An analyzer used primarily at query time to wrap another analyzer and provide a layer of protection
- * which prevents very common words from being passed into queries. For very large indexes the cost
+/**
+ * An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
+ * which prevents very common words from being passed into queries.
+ *
+ * For very large indexes the cost
* of reading TermDocs for a very common word can be high. This analyzer was created after experience with
* a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
* this term to take 2 seconds.
- *
+ *
+ *
* Use the various "addStopWords" methods in this class to automate the identification and addition of
* stop words found in an already existing index.
- *
- *
- *
-
+ *
*/
public class QueryAutoStopWordAnalyzer extends Analyzer {
Analyzer delegate;
@@ -50,9 +50,9 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
public static final float defaultMaxDocFreqPercent = 0.4f;
/**
- * Initializes this analyzer with the Analyzer object that actual produces the tokens
+ * Initializes this analyzer with the {@link Analyzer} object that actually produces the tokens
*
- * @param delegate The choice of analyzer that is used to produce the token stream which needs filtering
+ * @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
*/
public QueryAutoStopWordAnalyzer(Analyzer delegate) {
this.delegate = delegate;
@@ -62,7 +62,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
/**
* Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
*
- * @param reader The IndexReader class which will be consulted to identify potential stop words that
+ * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
* exceed the required document frequency
* @return The number of stop words identified.
* @throws IOException
@@ -74,7 +74,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
/**
* Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
*
- * @param reader The IndexReader class which will be consulted to identify potential stop words that
+ * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
* exceed the required document frequency
* @param maxDocFreq The maximum number of index documents which can contain a term, after which
* the term is considered to be a stop word
@@ -94,7 +94,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
/**
* Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
*
- * @param reader The IndexReader class which will be consulted to identify potential stop words that
+ * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
* exceed the required document frequency
* @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
* contain a term, after which the word is considered to be a stop word.
@@ -114,7 +114,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
/**
* Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
*
- * @param reader The IndexReader class which will be consulted to identify potential stop words that
+ * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
* exceed the required document frequency
* @param fieldName The field for which stopwords will be added
* @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
@@ -129,7 +129,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
/**
* Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
*
- * @param reader The IndexReader class which will be consulted to identify potential stop words that
+ * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
* exceed the required document frequency
* @param fieldName The field for which stopwords will be added
* @param maxDocFreq The maximum number of index documents which
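A hedged sketch of the workflow described above: wrap a delegate analyzer, then let addStopWords(IndexReader) promote high-frequency terms to query-time stop words. The index path, the no-argument StandardAnalyzer constructor, and the IndexReader.open/FSDirectory.open calls are illustrative assumptions, not part of this patch.

    import java.io.File;
    import org.apache.lucene.analysis.query.QueryAutoStopWordAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class AutoStopWordSketch {
      public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(new File("/path/to/index"));  // placeholder index location
        IndexReader reader = IndexReader.open(dir);
        QueryAutoStopWordAnalyzer analyzer =
            new QueryAutoStopWordAnalyzer(new StandardAnalyzer());
        int added = analyzer.addStopWords(reader);  // uses defaultMaxDocFreqPercent (0.4)
        System.out.println(added + " terms will be filtered from queries");
        reader.close();
      }
    }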
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/package.html b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/package.html
new file mode 100644
index 00000000000..783648b8050
--- /dev/null
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/package.html
@@ -0,0 +1,5 @@
+
+
+Automatically filter high-frequency stopwords.
+
+
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/package.html b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/package.html
new file mode 100644
index 00000000000..f1eb2bb5cb4
--- /dev/null
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/package.html
@@ -0,0 +1,5 @@
+
+
+Filter to reverse token text.
+
+
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
index d3bf8b56162..a6ab1819c3b 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
@@ -29,10 +29,12 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
- * Analyzer for Russian language. Supports an external list of stopwords (words that
+ * {@link Analyzer} for the Russian language.
+ *
+ * Supports an external list of stopwords (words that
* will not be indexed at all).
* A default set of stopwords is used unless an alternative list is specified.
- *
+ *
*
* @version $Id$
*/
@@ -246,10 +248,13 @@ public final class RussianAnalyzer extends Analyzer
}
/**
- * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * Creates a {@link TokenStream} which tokenizes all the text in the
+ * provided {@link Reader}.
*
- * @return A TokenStream built from a RussianLetterTokenizer filtered with
- * RussianLowerCaseFilter, StopFilter, and RussianStemFilter
+ * @return A {@link TokenStream} built from a
+ * {@link RussianLetterTokenizer} filtered with
+ * {@link RussianLowerCaseFilter}, {@link StopFilter},
+ * and {@link RussianStemFilter}
*/
public TokenStream tokenStream(String fieldName, Reader reader)
{
@@ -266,11 +271,13 @@ public final class RussianAnalyzer extends Analyzer
};
/**
- * Returns a (possibly reused) TokenStream which tokenizes all the text
- * in the provided Reader.
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+ * in the provided {@link Reader}.
*
- * @return A TokenStream built from a RussianLetterTokenizer filtered with
- * RussianLowerCaseFilter, StopFilter, and RussianStemFilter
+ * @return A {@link TokenStream} built from a
+ * {@link RussianLetterTokenizer} filtered with
+ * {@link RussianLowerCaseFilter}, {@link StopFilter},
+ * and {@link RussianStemFilter}
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
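A sketch of the reusableTokenStream(String, Reader) pattern documented above: the second call may hand back the same filter chain, reset onto the new Reader. The default constructor and the sample text are assumptions for illustration.

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ru.RussianAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ReusableStreamSketch {
      static void drain(TokenStream ts) throws Exception {
        TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(termAtt.term());
        }
      }

      public static void main(String[] args) throws Exception {
        RussianAnalyzer analyzer = new RussianAnalyzer();
        // the same underlying chain may be reused between these two calls
        drain(analyzer.reusableTokenStream("body", new StringReader("первый текст")));
        drain(analyzer.reusableTokenStream("body", new StringReader("второй текст")));
      }
    }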
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
index e670ff2623b..bacbd325c06 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
@@ -19,10 +19,11 @@ package org.apache.lucene.analysis.ru;
/**
* RussianCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
* for russian characters in Unicode, KOI8 and CP1252.
+ *
* Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
* One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
* and adding logic to toLowerCase() method for that charset.
- *
+ *
*
* @version $Id$
*/
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
index 546fd8c99e1..3a38e3e6799 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
@@ -19,13 +19,18 @@ package org.apache.lucene.analysis.ru;
import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
+import org.apache.lucene.analysis.Tokenizer; // for javadocs
+import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
/**
- * A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
- * in a given "russian charset". The problem with LeterTokenizer is that it uses Character.isLetter() method,
+ * A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
+ * by additionally looking up letters in a given "russian charset".
+ *
+ * The problem with
+ * {@link LetterTokenizer} is that it uses the {@link Character#isLetter(char)} method,
* which doesn't know how to detect letters in encodings like CP1252 and KOI8
* (well-known problems with 0xD7 and 0xF7 chars)
- *
+ *
*
* @version $Id$
*/
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
index cd54f0b5712..41eed6ae91c 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ru;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
index ab87c2b2ea0..4aed458a364 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.ru;
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -25,10 +24,12 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
/**
- * A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
- * The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter ,
- * because RussianStemFilter only works with lowercase part of any "russian" charset.
- *
+ * A {@link TokenFilter} that stems Russian words.
+ *
+ * The implementation was inspired by GermanStemFilter.
+ * The input should be filtered by {@link RussianLowerCaseFilter} before passing it to RussianStemFilter,
+ * because RussianStemFilter only works with the lowercase part of any "russian" charset.
+ *
*
* @version $Id$
*/
@@ -66,7 +67,7 @@ public final class RussianStemFilter extends TokenFilter
/**
- * Set a alternative/custom RussianStemmer for this filter.
+ * Set an alternative/custom {@link RussianStemmer} for this filter.
*/
public void setStemmer(RussianStemmer stemmer)
{
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
index 358ae084c59..2ac6ffec91a 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
@@ -25,8 +25,10 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
- * A ShingleAnalyzerWrapper wraps a ShingleFilter around another analyzer. A
- * shingle is another namefor a token based n-gram.
+ * A ShingleAnalyzerWrapper wraps a {@link ShingleFilter} around another {@link Analyzer}.
+ *
+ * A shingle is another name for a token-based n-gram.
+ *
*/
public class ShingleAnalyzerWrapper extends Analyzer {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
index 45fd2474634..8395dc24eb5 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
@@ -76,7 +76,7 @@ public class ShingleFilter extends TokenFilter {
/**
* Constructs a ShingleFilter with the specified single size from the
- * TokenStream input
+ * {@link TokenStream} input
*
* @param input input stream
* @param maxShingleSize maximum shingle size produced by the filter.
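A sketch of the (TokenStream, maxShingleSize) constructor documented above, fed by a core WhitespaceTokenizer; the sample phrase is arbitrary, and the exact ordering of unigrams and bigrams in the output is not asserted here.

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.shingle.ShingleFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ShingleSketch {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new ShingleFilter(
            new WhitespaceTokenizer(new StringReader("please divide this")), 2);
        TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(termAtt.term());  // unigrams plus word bigrams such as "please divide"
        }
      }
    }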
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
index 6570e59d721..9388e5be106 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
@@ -129,7 +129,7 @@ public class ShingleMatrixFilter extends TokenStream {
/**
* Retrieves information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
* @param token
- * @return
+ * @return {@link ShingleMatrixFilter.TokenPositioner}
* @throws IOException
*/
public abstract TokenPositioner getTokenPositioner(Token token) throws IOException;
@@ -1014,7 +1014,7 @@ public class ShingleMatrixFilter extends TokenStream {
* Returns a 32 bit float from the payload, or 1f it null.
*
* @param token
- * @return
+ * @return 32 bit float
*/
public float getWeight(Token token) {
if (token.getPayload() == null || token.getPayload().getData() == null) {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html
new file mode 100644
index 00000000000..ca0bb76529d
--- /dev/null
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html
@@ -0,0 +1,5 @@
+
+
+Word n-gram filters
+
+
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
index db4ee21e69d..a0d6ab4422f 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
@@ -27,7 +27,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
- * Analyzer for Thai language. It uses java.text.BreakIterator to break words.
+ * {@link Analyzer} for the Thai language. It uses {@link java.text.BreakIterator} to break words.
* @version 0.2
*/
public class ThaiAnalyzer extends Analyzer {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
index 95baaa491d3..aa0d062b1b0 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
@@ -28,7 +28,7 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.text.BreakIterator;
/**
- * TokenFilter that use java.text.BreakIterator to break each
+ * {@link TokenFilter} that uses {@link java.text.BreakIterator} to break each
* Token that is Thai into separate Token(s) for each Thai word.
* @version 0.2
*/
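A plain-JDK sketch of the java.text.BreakIterator word breaking that the filter above relies on; the Thai sample string is illustrative, and the actual segmentation depends on the JDK's Thai dictionary support.

    import java.text.BreakIterator;
    import java.util.Locale;

    public class BreakIteratorSketch {
      public static void main(String[] args) {
        String text = "\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e35\u0e04\u0e23\u0e31\u0e1a";  // Thai sample text
        BreakIterator words = BreakIterator.getWordInstance(new Locale("th"));
        words.setText(text);
        int start = words.first();
        for (int end = words.next(); end != BreakIterator.DONE; start = end, end = words.next()) {
          String piece = text.substring(start, end).trim();
          if (piece.length() > 0) {
            System.out.println(piece);  // one line per word boundary segment
          }
        }
      }
    }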
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/package.html b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/package.html
new file mode 100644
index 00000000000..b77b1b9e26c
--- /dev/null
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/package.html
@@ -0,0 +1,5 @@
+
+
+Analyzer for Thai.
+
+
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
index 15527b75c52..0427cacc865 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
@@ -118,6 +118,14 @@ public class TestBrazilianStemmer extends TestCase {
check("quiosque", "quiosqu");
}
+ public void testNormalization() throws Exception {
+ check("Brasil", "brasil"); // lowercase by default
+ check("Brasília", "brasil"); // remove diacritics
+ check("quimio5terápicos", "quimio5terapicos"); // contains non-letter, diacritic will still be removed
+ check("áá", "áá"); // token is too short: diacritics are not removed
+ check("ááá", "aaa"); // normally, diacritics are removed
+ }
+
public void testReusableTokenStream() throws Exception {
Analyzer a = new BrazilianAnalyzer();
checkReuse(a, "boa", "boa");
@@ -126,6 +134,11 @@ public class TestBrazilianStemmer extends TestCase {
 checkReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portuguese
}
+ public void testStemExclusionTable() throws Exception {
+ BrazilianAnalyzer a = new BrazilianAnalyzer();
+ a.setStemExclusionTable(new String[] { "quintessência" });
+ checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
+ }
private void check(final String input, final String expected) throws IOException {
Analyzer analyzer = new BrazilianAnalyzer();
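The new tests above exercise normalization and the stem exclusion table; a rough usage sketch of the same API outside the test harness (the field name and sample text are made up for illustration):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.br.BrazilianAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class BrazilianDemo {
      public static void main(String[] args) throws Exception {
        BrazilianAnalyzer analyzer = new BrazilianAnalyzer();
        analyzer.setStemExclusionTable(new String[] { "quintessência" });
        TokenStream ts = analyzer.tokenStream("body", new StringReader("quintessência Brasília"));
        TermAttribute term = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          // per the tests above: "quintessência" (excluded, unchanged), "brasil" (lowercased, de-accented, stemmed)
          System.out.println(term.term());
        }
        ts.close();
      }
    }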
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
index fde7c3c33df..0addd8757e3 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
@@ -169,6 +169,66 @@ public class TestCJKTokenizer extends TestCase{
checkCJKToken(str, out_tokens);
}
+ /*
+ * Full-width text is normalized to half-width
+ */
+ public void testFullWidth() throws Exception {
+ String str = "Ｔｅｓｔ １２３４";
+ TestToken[] out_tokens = {
+ newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
+ newToken("1234", 5, 9, CJKTokenizer.SINGLE_TOKEN_TYPE)
+ };
+ checkCJKToken(str, out_tokens);
+ }
+
+ /*
+ * Non-English text (not just CJK) is treated the same as CJK: C1C2 C2C3
+ */
+ public void testNonIdeographic() throws Exception {
+ String str = "\u4e00 روبرت موير";
+ TestToken[] out_tokens = {
+ newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("رو", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("وب", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("بر", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("رت", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("مو", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("وي", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("ير", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+ };
+ checkCJKToken(str, out_tokens);
+ }
+
+ /*
+ * Non-English text with non-letters (non-spacing marks, etc.) is treated as C1C2 C2C3,
+ * except that words are split around non-letters.
+ */
+ public void testNonIdeographicNonLetter() throws Exception {
+ String str = "\u4e00 رُوبرت موير";
+ TestToken[] out_tokens = {
+ newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("ر", 2, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("وب", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("بر", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("رت", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("مو", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("وي", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("ير", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+ };
+ checkCJKToken(str, out_tokens);
+ }
+
+ public void testTokenStream() throws Exception {
+ Analyzer analyzer = new CJKAnalyzer();
+ TokenStream ts = analyzer.tokenStream("dummy", new StringReader("\u4e00\u4e01\u4e02"));
+ TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+ assertTrue(ts.incrementToken());
+ assertEquals("\u4e00\u4e01", termAtt.term());
+ assertTrue(ts.incrementToken());
+ assertEquals("\u4e01\u4e02", termAtt.term());
+ assertFalse(ts.incrementToken());
+ }
+
public void testReusableTokenStream() throws Exception {
Analyzer analyzer = new CJKAnalyzer();
String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
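The "C1C2 C2C3" notation in the comments above means overlapping two-character windows over a run of characters. A tiny illustration of that pattern on its own, independent of CJKTokenizer:

    // Sketch: overlapping bigrams ("C1C2 C2C3") over a run of characters.
    public class BigramDemo {
      public static void main(String[] args) {
        String run = "\u4e00\u4e01\u4e02"; // three consecutive ideographs
        for (int i = 0; i + 2 <= run.length(); i++) {
          System.out.println(run.substring(i, i + 2)); // \u4e00\u4e01, then \u4e01\u4e02
        }
      }
    }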
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
index 07a88bb8636..2477b90f909 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
@@ -18,12 +18,15 @@ package org.apache.lucene.analysis.cn;
*/
import java.io.IOException;
+import java.io.Reader;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -59,6 +62,76 @@ public class TestChineseTokenizer extends TestCase
new int[] { 1, 2, 3 });
}
+ /*
+ * Analyzer that just uses ChineseTokenizer, not ChineseFilter.
+ * convenience to show the behavior of the tokenizer
+ */
+ private class JustChineseTokenizerAnalyzer extends Analyzer {
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new ChineseTokenizer(reader);
+ }
+ }
+
+ /*
+ * Analyzer that just uses ChineseFilter, not ChineseTokenizer.
+ * convenience to show the behavior of the filter.
+ */
+ private class JustChineseFilterAnalyzer extends Analyzer {
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new ChineseFilter(new WhitespaceTokenizer(reader));
+ }
+ }
+
+ /*
+ * ChineseTokenizer tokenizes numbers as one token, but they are filtered by ChineseFilter
+ */
+ public void testNumerics() throws Exception
+ {
+ Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
+ assertAnalyzesTo(justTokenizer, "中1234", new String[] { "中", "1234" });
+
+ // in this case the ChineseAnalyzer (which applies ChineseFilter) will remove the numeric token.
+ Analyzer a = new ChineseAnalyzer();
+ assertAnalyzesTo(a, "中1234", new String[] { "中" });
+ }
+
+ /*
+ * ChineseTokenizer tokenizes English text similarly to SimpleAnalyzer:
+ * it lowercases terms automatically.
+ *
+ * ChineseFilter has an English stopword list and also removes any single-character tokens;
+ * the stopword list is case-sensitive.
+ */
+ public void testEnglish() throws Exception
+ {
+ Analyzer chinese = new ChineseAnalyzer();
+ assertAnalyzesTo(chinese, "This is a Test. b c d",
+ new String[] { "test" });
+
+ Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
+ assertAnalyzesTo(justTokenizer, "This is a Test. b c d",
+ new String[] { "this", "is", "a", "test", "b", "c", "d" });
+
+ Analyzer justFilter = new JustChineseFilterAnalyzer();
+ assertAnalyzesTo(justFilter, "This is a Test. b c d",
+ new String[] { "This", "Test." });
+ }
+
+ private void assertAnalyzesTo(Analyzer a, String input, String[] output)
+ throws Exception {
+ TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ TermAttribute termAtt = (TermAttribute) ts
+ .getAttribute(TermAttribute.class);
+
+ for (int i = 0; i < output.length; i++) {
+ assertTrue(ts.incrementToken());
+ assertEquals(output[i], termAtt.term());
+ }
+
+ assertFalse(ts.incrementToken());
+ ts.close();
+ }
+
private void assertAnalyzesToReuse(Analyzer a, String input, String[] output,
int startOffsets[], int endOffsets[])
throws Exception {
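JustChineseTokenizerAnalyzer and JustChineseFilterAnalyzer above isolate the two halves of the chain. For completeness, a sketch of chaining them by hand, which is presumably what ChineseAnalyzer itself does (an assumption, not verified from this patch):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cn.ChineseFilter;
    import org.apache.lucene.analysis.cn.ChineseTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ChineseChainDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new ChineseFilter(new ChineseTokenizer(new StringReader("This is a Test. 中1234")));
        TermAttribute term = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          // per the tests above, only "test" and "中" should survive: the tokenizer lowercases
          // English, and the filter drops stopwords, single-letter tokens, and numerics
          System.out.println(term.term());
        }
        ts.close();
      }
    }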
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
index 303a57c6f64..64c64b239c6 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
@@ -90,12 +90,12 @@ public class TestGermanStemFilter extends TestCase {
}
private void check(final String input, final String expected) throws IOException {
- StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
- GermanStemFilter filter = new GermanStemFilter(tokenStream);
- TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
- assertTrue(filter.incrementToken());
+ Analyzer a = new GermanAnalyzer();
+ TokenStream tokenStream = a.tokenStream("dummy", new StringReader(input));
+ TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+ assertTrue(tokenStream.incrementToken());
assertEquals(expected, termAtt.term());
- filter.close();
+ tokenStream.close();
}
private void checkReuse(Analyzer a, String input, String expected) throws IOException {
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
index 38e71e3e86c..ec154b3e5c2 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
@@ -18,9 +18,11 @@ package org.apache.lucene.analysis.query;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
@@ -35,6 +37,7 @@ import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
import java.io.Reader;
+import java.io.StringReader;
public class QueryAutoStopWordAnalyzerTest extends TestCase {
String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
@@ -162,4 +165,37 @@ public class QueryAutoStopWordAnalyzerTest extends TestCase {
Hits h = search(a, "repetitiveField:boring");
assertFalse(h.length() == 0);
}
+
+ /*
+ * An analyzer that does not support reuse:
+ * it is a LetterTokenizer on odd invocations and a WhitespaceTokenizer on even ones.
+ */
+ private class NonreusableAnalyzer extends Analyzer {
+ int invocationCount = 0;
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ if (++invocationCount % 2 == 0)
+ return new WhitespaceTokenizer(reader);
+ else
+ return new LetterTokenizer(reader);
+ }
+ }
+
+ public void testWrappingNonReusableAnalyzer() throws Exception {
+ QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(new NonreusableAnalyzer());
+ a.addStopWords(reader, 10);
+ Hits h = search(a, "repetitiveField:boring");
+ assertTrue(h.length() == 0);
+ h = search(a, "repetitiveField:vaguelyboring");
+ assertTrue(h.length() == 0);
+ }
+
+ public void testTokenStream() throws Exception {
+ QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(new WhitespaceAnalyzer());
+ a.addStopWords(reader, 10);
+ TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
+ TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+ assertTrue(ts.incrementToken());
+ assertEquals("this", termAtt.term());
+ assertFalse(ts.incrementToken());
+ }
}
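The new tests show the intended call sequence; sketched here outside JUnit (the IndexReader is assumed to come from an index containing a "repetitiveField", as in the test fixture above):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.query.QueryAutoStopWordAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.index.IndexReader;

    public class AutoStopWordDemo {
      // fragment: `reader` is assumed to be an IndexReader over an index with a "repetitiveField"
      static void demo(IndexReader reader) throws Exception {
        QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(new WhitespaceAnalyzer());
        a.addStopWords(reader, 10); // frequent terms in the index become per-field stop words
        TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
        TermAttribute term = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // per the test above: only "this"; "boring" was detected as a stop word
        }
      }
    }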
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
index 3f6dfe600db..cb80ad9d0b3 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
@@ -336,7 +336,9 @@ public class TestShingleMatrixFilter extends TestCase {
* @throws IOException
*/
public void testMatrix() throws IOException {
-
+ // some other tests set this to null.
+ // set it here in case tests are run out of the usual order.
+ ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
Matrix matrix = new Matrix();
matrix.new Column(tokenFactory("no", 1));
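A possible follow-up, not part of this patch: resetting the shared static codec in setUp() would make every test order-independent rather than patching individual tests. A sketch:

    // Sketch only: reset the shared codec before each test so no test depends on execution order.
    protected void setUp() throws Exception {
      super.setUp();
      ShingleMatrixFilter.defaultSettingsCodec =
          new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
    }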
diff --git a/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java b/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
index 5d6d61f0115..025596eb755 100644
--- a/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
+++ b/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
@@ -57,7 +57,7 @@ public class TestSmartChineseAnalyzer extends TestCase {
* This test is the same as the above, except using an ideographic space as a separator.
* This tests to ensure the stopwords are working correctly.
*/
- public void testChineseStopWordsDefaultTwoPhrasesIdeoSpache() throws Exception {
+ public void testChineseStopWordsDefaultTwoPhrasesIdeoSpace() throws Exception {
Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
String sentence = "我购买了道具和服装 我购买了道具和服装。";
String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
@@ -101,6 +101,52 @@ public class TestSmartChineseAnalyzer extends TestCase {
new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
}
+ /*
+ * Numerics are parsed as their own tokens
+ */
+ public void testNumerics() throws Exception {
+ assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Tests 了道具和服装1234",
+ new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
+ }
+
+ /*
+ * Full-width alphas and numerics are folded to half-width
+ */
+ public void testFullWidth() throws Exception {
+ assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Ｔｅｓｔｓ 了道具和服装１２３４",
+ new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
+ }
+
+ /*
+ * Presentation form delimiters are removed
+ */
+ public void testDelimiters() throws Exception {
+ assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买︱ Tests 了道具和服装",
+ new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
+ }
+
+ /*
+ * Text from writing systems other than Chinese and Latin is parsed as individual characters
+ * (regardless of Unicode category).
+ */
+ public void testNonChinese() throws Exception {
+ assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 روبرتTests 了道具和服装",
+ new String[] { "我", "购买", "ر", "و", "ب", "ر", "ت", "test", "了", "道具", "和", "服装"});
+ }
+
+ /*
+ * Test what the analyzer does with out-of-vocabulary words.
+ * In this case the name is Yousaf Raza Gillani.
+ * Currently it is being analyzed into single characters...
+ */
+ public void testOOV() throws Exception {
+ assertAnalyzesTo(new SmartChineseAnalyzer(true), "优素福·拉扎·吉拉尼",
+ new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
+
+ assertAnalyzesTo(new SmartChineseAnalyzer(true), "优素福拉扎吉拉尼",
+ new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
+ }
+
public void testOffsets() throws Exception {
assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买了道具和服装",
new String[] { "我", "购买", "了", "道具", "和", "服装" },
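The assertAnalyzesTo helper used throughout these tests is not shown in this hunk; a sketch of consuming SmartChineseAnalyzer output directly, along the lines of what that helper presumably does (an assumption; package and constructor follow the test code above):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cn.SmartChineseAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class SmartCnDemo {
      public static void main(String[] args) throws Exception {
        Analyzer a = new SmartChineseAnalyzer(true); // true: use the default stopword list, as in the tests above
        TokenStream ts = a.tokenStream("dummy", new StringReader("我购买了道具和服装"));
        TermAttribute term = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // 我, 购买, 了, 道具, 和, 服装 (per testOffsets above)
        }
        ts.close();
      }
    }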
diff --git a/contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java b/contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java
index 73cb91a28f6..efbb413bf26 100644
--- a/contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java
+++ b/contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java
@@ -109,6 +109,7 @@ public class TestSynonymTokenFilter extends TestCase {
streams.source = new WhitespaceTokenizer(reader);
streams.result = new LowerCaseFilter(streams.source);
streams.result = new SynonymTokenFilter(streams.result, synonyms, maxSynonyms);
+ setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
streams.result.reset(); // reset the SynonymTokenFilter
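The added setPreviousTokenStream call is what makes the usual reuse pattern work: without it, getPreviousTokenStream() keeps returning null and a fresh chain is built on every call. The general shape of the pattern, as a fragment of an Analyzer subclass with illustrative filter names rather than this analyzer's exact code:

    // Sketch of the reusableTokenStream pattern the fix above completes.
    private static final class SavedStreams {
      Tokenizer source;
      TokenStream result;
    }

    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
      if (streams == null) {
        streams = new SavedStreams();
        streams.source = new WhitespaceTokenizer(reader);
        streams.result = new LowerCaseFilter(streams.source);
        setPreviousTokenStream(streams); // remember the chain for the next call on this thread
      } else {
        streams.source.reset(reader);    // reuse the existing chain against the new Reader
      }
      return streams.result;
    }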