diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java index a7c82720b22..7c033748756 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java @@ -34,7 +34,7 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; /** - * Analyzer for Arabic. + * {@link Analyzer} for Arabic. *

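[Editor's note — illustrative only, not part of the patch: the filter chain documented in the hunk below can be exercised with a short driver against the 2.9-era TokenStream attribute API. The class name, field name, and sample text are invented for the demo.]

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ar.ArabicAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ArabicAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        // Chain per the javadoc below: ArabicLetterTokenizer -> StopFilter
        //   -> LowerCaseFilter -> ArabicNormalizationFilter -> ArabicStemFilter
        ArabicAnalyzer analyzer = new ArabicAnalyzer();
        TokenStream ts = analyzer.tokenStream("contents",
            new StringReader("\u0627\u0644\u0643\u062a\u0628 \u0627\u0644\u0639\u0631\u0628\u064a\u0629"));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // prints the normalized, light-stemmed terms
        }
      }
    }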
* This analyzer implements light-stemming as specified by: * @@ -108,10 +108,11 @@ public final class ArabicAnalyzer extends Analyzer { /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. + * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * - * @return A TokenStream built from an ArabicTokenizer filtered with - * StopFilter, LowerCaseFilter, ArabicNormalizationFilter and ArabicStemFilter. + * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with + * {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter} + * and {@link ArabicStemFilter}. */ public final TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new ArabicLetterTokenizer( reader ); @@ -129,12 +130,12 @@ public final class ArabicAnalyzer extends Analyzer { }; /** - * Returns a (possibly reused) TokenStream which tokenizes all the text - * in the provided Reader. + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text + * in the provided {@link Reader}. * - * @return A TokenStream built from an ArabicTokenizer filtered with - * StopFilter, LowerCaseFilter, ArabicNormalizationFilter and - * ArabicStemFilter. + * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with + * {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter} + * and {@link ArabicStemFilter}. */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java index 4e12ab7a1c5..75bd09e9d23 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java @@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography. + * A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography. * */ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java index 34beb5f9fa9..e07756b053b 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java @@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words.. + * A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words.
* */ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java index 39feeb8d558..d06f4cc7f03 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java @@ -34,15 +34,17 @@ import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; /** - * Analyzer for Brazilian language. Supports an external list of stopwords (words that - * will not be indexed at all) and an external list of exclusions (word that will + * {@link Analyzer} for the Brazilian Portuguese language. + * <p>

+ * Supports an external list of stopwords (words that + * will not be indexed at all) and an external list of exclusions (words that will * not be stemmed, but indexed). - * + * </p>

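[Editor's note — illustrative only, not part of the patch: a sketch of the stopword and stem-exclusion behavior documented above, using the String[] overload of setStemExclusionTable. The class name, field name, and sample text are invented.]

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.br.BrazilianAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class BrazilianAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        BrazilianAnalyzer analyzer = new BrazilianAnalyzer();
        // Entries must match the lowercased tokens; excluded words are indexed unstemmed.
        analyzer.setStemExclusionTable(new String[] { "bras\u00edlia" });
        TokenStream ts = analyzer.tokenStream("body",
            new StringReader("as bibliotecas de Bras\u00edlia"));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // stopwords "as" and "de" are dropped; "bibliotecas" is stemmed
        }
      }
    }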
*/ public final class BrazilianAnalyzer extends Analyzer { /** - * List of typical Brazilian stopwords. + * List of typical Brazilian Portuguese stopwords. */ public final static String[] BRAZILIAN_STOP_WORDS = { "a","ainda","alem","ambas","ambos","antes", @@ -67,7 +69,7 @@ public final class BrazilianAnalyzer extends Analyzer { /** - * Contains the stopwords used with the StopFilter. + * Contains the stopwords used with the {@link StopFilter}. */ private Set stoptable = new HashSet(); @@ -111,7 +113,7 @@ public final class BrazilianAnalyzer extends Analyzer { excltable = StopFilter.makeStopSet( exclusionlist ); } /** - * Builds an exclusionlist from a Hashtable. + * Builds an exclusionlist from a {@link Map}. */ public void setStemExclusionTable( Map exclusionlist ) { excltable = new HashSet(exclusionlist.keySet()); @@ -124,11 +126,11 @@ public final class BrazilianAnalyzer extends Analyzer { } /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. + * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * - * @return A TokenStream built from a StandardTokenizer filtered with - * LowerCaseFilter, StandardFilter, StopFilter, and - * BrazilianStemFilter. + * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with + * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and + * {@link BrazilianStemFilter}. */ public final TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer( reader ); @@ -145,12 +147,12 @@ public final class BrazilianAnalyzer extends Analyzer { }; /** - * Returns a (possibly reused) TokenStream which tokenizes all the text - * in the provided Reader. + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text + * in the provided {@link Reader}. * - * @return A TokenStream built from a StandardTokenizer filtered with - * LowerCaseFilter, StandardFilter, StopFilter, and - * BrazilianStemFilter. + * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with + * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and + * {@link BrazilianStemFilter}. */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java index 3eff32f9faa..c6ed0b5b5b0 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java @@ -25,13 +25,13 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * Based on GermanStemFilter + * A {@link TokenFilter} that applies {@link BrazilianStemmer}. * */ public final class BrazilianStemFilter extends TokenFilter { /** - * The actual token in the input stream. + * {@link BrazilianStemmer} in use by this filter. 
*/ private BrazilianStemmer stemmer = null; private Set exclusions = null; diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java index b358f02711b..aaea8ccad6c 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java @@ -18,7 +18,7 @@ package org.apache.lucene.analysis.br; */ /** - * A stemmer for Brazilian words. + * A stemmer for Brazilian Portuguese words. */ public class BrazilianStemmer { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/package.html b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/package.html index 62f98d78143..dfcdeea0aa4 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/package.html +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/package.html @@ -1,5 +1,5 @@ -Analyzer for Brazilian. +Analyzer for Brazilian Portuguese. diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java index ee39161d157..f5e871b5722 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java @@ -28,7 +28,8 @@ import java.util.Set; /** - * Filters CJKTokenizer with StopFilter. + * An {@link Analyzer} that tokenizes text with {@link CJKTokenizer} and + * filters with {@link StopFilter}. * */ public class CJKAnalyzer extends Analyzer { @@ -77,11 +78,12 @@ public class CJKAnalyzer extends Analyzer { //~ Methods ---------------------------------------------------------------- /** - * get token stream from input + * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * * @param fieldName lucene field name - * @param reader input reader - * @return TokenStream + * @param reader input {@link Reader} + * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with + * {@link StopFilter}. */ public final TokenStream tokenStream(String fieldName, Reader reader) { return new StopFilter(new CJKTokenizer(reader), stopTable); @@ -93,11 +95,13 @@ public class CJKAnalyzer extends Analyzer { }; /** - * get (possibly reused) token stream from input + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text + * in the provided {@link Reader}.
* * @param fieldName lucene field name - * @param reader input reader - * @return TokenStream + * @param reader input {@link Reader} + * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with + * {@link StopFilter}. */ public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { /* tokenStream() is final, no back compat issue */ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java index 68fe8d54490..5ddd4c9e9d6 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java @@ -27,13 +27,20 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** - * CJKTokenizer was modified from StopTokenizer which does a decent job for - * most European languages. It performs other token methods for double-byte - * Characters: the token will return at each two characters with overlap match.
- * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it - * also need filter filter zero length token ""
- * for Digit: digit, '+', '#' will token as letter
- * for more info on Asia language(Chinese Japanese Korean) text segmentation: + * CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
+ * <p>
+ * The tokens returned are every two adjacent characters with overlap match.
+ * </p>
+ * <p>
+ * Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
+ * </p>

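[Editor's note — illustrative only, not part of the patch: a tiny driver showing the bigram example above and the Latin-text rules that follow. The class name and input string are invented.]

    import java.io.StringReader;
    import org.apache.lucene.analysis.cjk.CJKTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class CJKTokenizerDemo {
      public static void main(String[] args) throws Exception {
        // "C++ JAVA2" exercises the Latin rules; the two CJK chars form a single bigram
        CJKTokenizer tokenizer = new CJKTokenizer(
            new StringReader("C++ JAVA2 \u4e2d\u6587"));
        TermAttribute term = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
        while (tokenizer.incrementToken()) {
          // expected: [c++] [java2] [\u4e2d\u6587] -- Latin runs are lowercased,
          // '+' and digits are kept as letters, CJK runs come out as bigrams
          System.out.print("[" + term.term() + "] ");
        }
      }
    }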
+ * Additionally, the following is applied to Latin text (such as English):
+ * <ul>
+ * <li>Text is converted to lowercase.
+ * <li>Numeric digits, '+', '#', and '_' are tokenized as letters.
+ * </ul>
+ * For more info on Asian language (Chinese, Japanese, and Korean) text segmentation: + * please search google * diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java index 5470a4f215a..1024e7bfd51 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java @@ -24,13 +24,8 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; /** - * Title: ChineseAnalyzer - * Description: - * Subclass of org.apache.lucene.analysis.Analyzer - * build from a ChineseTokenizer, filtered with ChineseFilter. - * Copyright: Copyright (c) 2001 - * Company: - * @version 1.0 + * An {@link Analyzer} that tokenizes text with {@link ChineseTokenizer} and + * filters with {@link ChineseFilter}. * */ @@ -40,9 +35,10 @@ public class ChineseAnalyzer extends Analyzer { } /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. + * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * - * @return A TokenStream build from a ChineseTokenizer filtered with ChineseFilter. + * @return A {@link TokenStream} built from a {@link ChineseTokenizer} + * filtered with {@link ChineseFilter}. */ public final TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new ChineseTokenizer(reader); @@ -56,11 +52,11 @@ public class ChineseAnalyzer extends Analyzer { }; /** - * Returns a (possibly reused) TokenStream which tokenizes all the text in the - * provided Reader. + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the + * provided {@link Reader}. * - * @return A TokenStream build from a ChineseTokenizer filtered with - * ChineseFilter. + * @return A {@link TokenStream} built from a {@link ChineseTokenizer} + * filtered with {@link ChineseFilter}. */ public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java index 31de4a7f0a5..7e847fb48ef 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java @@ -26,18 +26,19 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * Title: ChineseFilter - * Description: Filter with a stop word table - * Rule: No digital is allowed. - * English word/token should larger than 1 character. - * One Chinese character as one Chinese word. + * A {@link TokenFilter} with a stop word table.
+ * <ul>
+ * <li>Numeric tokens are removed.
+ * <li>English tokens must be larger than 1 character.
+ * <li>One Chinese character as one Chinese word.
+ * </ul>
 * TO DO: - * 1. Add Chinese stop words, such as \ue400 - * 2. Dictionary based Chinese word extraction - * 3. Intelligent Chinese word extraction - * - * Copyright: Copyright (c) 2001 - * Company:
+ * <ol>
+ * <li>Add Chinese stop words, such as \ue400
+ * <li>Dictionary based Chinese word extraction
+ * <li>Intelligent Chinese word extraction
+ * </ol>
+ *
 * @version 1.0 * */ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java index 1d38378094f..0c5ee549cd1 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java @@ -27,28 +27,29 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * Title: ChineseTokenizer - * Description: Extract tokens from the Stream using Character.getType() - * Rule: A Chinese character as a single token - * Copyright: Copyright (c) 2001 - * Company: - * - * The difference between thr ChineseTokenizer and the - * CJKTokenizer (id=23545) is that they have different - * token parsing logic. + * Tokenize Chinese text as individual chinese characters. * - * Let me use an example. If having a Chinese text - * "C1C2C3C4" to be indexed, the tokens returned from the - * ChineseTokenizer are C1, C2, C3, C4. And the tokens - * returned from the CJKTokenizer are C1C2, C2C3, C3C4. - * - * Therefore the index the CJKTokenizer created is much - * larger. - *
+ * <p>
+ * The difference between ChineseTokenizer and
+ * CJKTokenizer is that they have different
+ * token parsing logic.
+ * </p>
+ * <p>
+ * For example, if the Chinese text
+ * "C1C2C3C4" is to be indexed:
+ * <ul>
+ * <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
+ * <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
+ * </ul>
+ * </p>
+ * <p>
+ * Therefore the index created by CJKTokenizer is much larger.
+ * </p>
+ * <p>
 * The problem is that when searching for C1, C1C2, C1C3, * C4C2, C1C2C3 ... the ChineseTokenizer works, but the * CJKTokenizer will not work. - *
+ * </p>
 * @version 1.0 * */ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html index aaa58651118..57027ee9082 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html @@ -3,7 +3,7 @@ -Analyzer for Chinese, which indexes unigrams (individuals chinese characters). +Analyzer for Chinese, which indexes unigrams (individual chinese characters).

Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
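[Editor's note — illustrative only, not part of the patch: the unigram/bigram contrast described in the ChineseTokenizer hunk above, shown end to end with the two analyzers this patch touches. Class name, field name, and sample text are invented.]

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKAnalyzer;
    import org.apache.lucene.analysis.cn.ChineseAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ChineseVsCJKDemo {
      private static void dump(Analyzer analyzer, String text) throws Exception {
        TokenStream ts = analyzer.tokenStream("f", new StringReader(text));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.print("[" + term.term() + "] ");
        }
        System.out.println();
      }

      public static void main(String[] args) throws Exception {
        String text = "\u4e2d\u6587\u5206\u6790"; // four CJK chars: "C1C2C3C4" in the javadoc's notation
        dump(new ChineseAnalyzer(), text); // unigrams: [C1] [C2] [C3] [C4]
        dump(new CJKAnalyzer(), text);     // overlapping bigrams: [C1C2] [C2C3] [C3C4]
      }
    }

The index-size and single-character-search trade-off discussed in the javadoc follows directly from these two token streams.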