From 3887cf9419333718d4a5b0520bff5242f5a7440f Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 18 Aug 2009 12:55:26 +0000 Subject: [PATCH] LUCENE-1692: Additional tests and javadocs for contrib/analyzers git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@805400 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/ar/ArabicAnalyzer.java | 19 ++--- .../ar/ArabicNormalizationFilter.java | 2 +- .../lucene/analysis/ar/ArabicStemFilter.java | 2 +- .../lucene/analysis/br/BrazilianAnalyzer.java | 32 ++++---- .../analysis/br/BrazilianStemFilter.java | 4 +- .../lucene/analysis/br/BrazilianStemmer.java | 2 +- .../apache/lucene/analysis/br/package.html | 2 +- .../lucene/analysis/cjk/CJKAnalyzer.java | 18 +++-- .../lucene/analysis/cjk/CJKTokenizer.java | 21 ++++-- .../lucene/analysis/cn/ChineseAnalyzer.java | 22 +++--- .../lucene/analysis/cn/ChineseFilter.java | 23 +++--- .../lucene/analysis/cn/ChineseTokenizer.java | 37 +++++----- .../apache/lucene/analysis/cn/package.html | 2 +- .../compound/CompoundWordTokenFilterBase.java | 5 +- .../DictionaryCompoundWordTokenFilter.java | 13 ++-- .../HyphenationCompoundWordTokenFilter.java | 15 ++-- .../compound/hyphenation/HyphenationTree.java | 2 +- .../lucene/analysis/compound/package.html | 2 +- .../lucene/analysis/cz/CzechAnalyzer.java | 27 +++---- .../lucene/analysis/de/GermanAnalyzer.java | 32 ++++---- .../lucene/analysis/de/GermanStemFilter.java | 10 ++- .../lucene/analysis/de/GermanStemmer.java | 6 +- .../lucene/analysis/el/GreekAnalyzer.java | 24 +++--- .../lucene/analysis/el/GreekCharsets.java | 3 +- .../lucene/analysis/fa/PersianAnalyzer.java | 31 ++++---- .../fa/PersianNormalizationFilter.java | 2 +- .../lucene/analysis/fr/ElisionFilter.java | 7 +- .../lucene/analysis/fr/FrenchAnalyzer.java | 27 ++++--- .../lucene/analysis/fr/FrenchStemFilter.java | 11 +-- .../lucene/analysis/fr/FrenchStemmer.java | 6 +- .../PrefixAndSuffixAwareTokenFilter.java | 2 +- .../miscellaneous/SingleTokenTokenStream.java | 2 +- .../analysis/miscellaneous/package.html | 5 ++ .../analysis/ngram/EdgeNGramTokenFilter.java | 10 +-- .../analysis/ngram/EdgeNGramTokenizer.java | 11 ++- .../analysis/ngram/NGramTokenFilter.java | 4 +- .../lucene/analysis/ngram/NGramTokenizer.java | 4 +- .../apache/lucene/analysis/ngram/package.html | 5 ++ .../lucene/analysis/nl/DutchAnalyzer.java | 25 ++++--- .../lucene/analysis/nl/DutchStemFilter.java | 10 ++- .../lucene/analysis/nl/DutchStemmer.java | 7 +- .../analysis/payloads/AbstractEncoder.java | 2 +- .../analysis/payloads/FloatEncoder.java | 2 +- .../analysis/payloads/PayloadEncoder.java | 4 +- .../analysis/payloads/PayloadHelper.java | 4 +- .../lucene/analysis/position/package.html | 5 ++ .../query/QueryAutoStopWordAnalyzer.java | 30 ++++---- .../apache/lucene/analysis/query/package.html | 5 ++ .../lucene/analysis/reverse/package.html | 5 ++ .../lucene/analysis/ru/RussianAnalyzer.java | 25 ++++--- .../lucene/analysis/ru/RussianCharsets.java | 3 +- .../analysis/ru/RussianLetterTokenizer.java | 11 ++- .../analysis/ru/RussianLowerCaseFilter.java | 1 - .../lucene/analysis/ru/RussianStemFilter.java | 13 ++-- .../shingle/ShingleAnalyzerWrapper.java | 6 +- .../analysis/shingle/ShingleFilter.java | 2 +- .../analysis/shingle/ShingleMatrixFilter.java | 4 +- .../lucene/analysis/shingle/package.html | 5 ++ .../lucene/analysis/th/ThaiAnalyzer.java | 2 +- .../lucene/analysis/th/ThaiWordFilter.java | 2 +- .../apache/lucene/analysis/th/package.html | 5 ++ .../analysis/br/TestBrazilianStemmer.java | 13 ++++ 
.../lucene/analysis/cjk/TestCJKTokenizer.java | 60 +++++++++++++++ .../analysis/cn/TestChineseTokenizer.java | 73 +++++++++++++++++++ .../analysis/de/TestGermanStemFilter.java | 10 +-- .../query/QueryAutoStopWordAnalyzerTest.java | 36 +++++++++ .../shingle/TestShingleMatrixFilter.java | 4 +- .../analysis/cn/TestSmartChineseAnalyzer.java | 48 +++++++++++- .../index/memory/TestSynonymTokenFilter.java | 1 + 69 files changed, 603 insertions(+), 272 deletions(-) create mode 100644 contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/package.html create mode 100644 contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/package.html create mode 100644 contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/package.html create mode 100644 contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/package.html create mode 100644 contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/package.html create mode 100644 contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/package.html create mode 100644 contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/package.html diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java index a7c82720b22..7c033748756 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java @@ -34,7 +34,7 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; /** - * Analyzer for Arabic. + * {@link Analyzer} for Arabic. *

* This analyzer implements light-stemming as specified by: * @@ -108,10 +108,11 @@ public final class ArabicAnalyzer extends Analyzer { /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. + * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * - * @return A TokenStream built from an ArabicTokenizer filtered with - * StopFilter, LowerCaseFilter, ArabicNormalizationFilter and ArabicStemFilter. + * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with + * {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter} + * and {@link ArabicStemFilter}. */ public final TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new ArabicLetterTokenizer( reader ); @@ -129,12 +130,12 @@ public final class ArabicAnalyzer extends Analyzer { }; /** - * Returns a (possibly reused) TokenStream which tokenizes all the text - * in the provided Reader. + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text + * in the provided {@link Reader}. * - * @return A TokenStream built from an ArabicTokenizer filtered with - * StopFilter, LowerCaseFilter, ArabicNormalizationFilter and - * ArabicStemFilter. + * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with + * {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter} + * and {@link ArabicStemFilter}. */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java index 4e12ab7a1c5..75bd09e9d23 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java @@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography. + * A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography. * */ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java index 34beb5f9fa9..e07756b053b 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java @@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words.. + * A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words.. 
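The analysis chain documented above can be exercised directly. A minimal sketch against the 2.9-era contrib API; the field name and the sample input word are made up for illustration:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ar.ArabicAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ArabicAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        // Uses the default stopword set and the tokenizer/filter chain
        // described in the javadoc above.
        ArabicAnalyzer analyzer = new ArabicAnalyzer();
        TokenStream ts = analyzer.tokenStream("contents", new StringReader("كتاب"));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // normalized, stemmed terms
        }
        ts.close();
      }
    }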
* */ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java index 39feeb8d558..d06f4cc7f03 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java @@ -34,15 +34,17 @@ import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; /** - * Analyzer for Brazilian language. Supports an external list of stopwords (words that - * will not be indexed at all) and an external list of exclusions (word that will + * {@link Analyzer} for Brazilian Portuguese language. + *

+ * Supports an external list of stopwords (words that + * will not be indexed at all) and an external list of exclusions (words that will * not be stemmed, but indexed). - * + *

*/ public final class BrazilianAnalyzer extends Analyzer { /** - * List of typical Brazilian stopwords. + * List of typical Brazilian Portuguese stopwords. */ public final static String[] BRAZILIAN_STOP_WORDS = { "a","ainda","alem","ambas","ambos","antes", @@ -67,7 +69,7 @@ public final class BrazilianAnalyzer extends Analyzer { /** - * Contains the stopwords used with the StopFilter. + * Contains the stopwords used with the {@link StopFilter}. */ private Set stoptable = new HashSet(); @@ -111,7 +113,7 @@ public final class BrazilianAnalyzer extends Analyzer { excltable = StopFilter.makeStopSet( exclusionlist ); } /** - * Builds an exclusionlist from a Hashtable. + * Builds an exclusionlist from a {@link Map}. */ public void setStemExclusionTable( Map exclusionlist ) { excltable = new HashSet(exclusionlist.keySet()); @@ -124,11 +126,11 @@ public final class BrazilianAnalyzer extends Analyzer { } /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. + * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * - * @return A TokenStream built from a StandardTokenizer filtered with - * LowerCaseFilter, StandardFilter, StopFilter, and - * BrazilianStemFilter. + * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with + * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and + * {@link BrazilianStemFilter}. */ public final TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer( reader ); @@ -145,12 +147,12 @@ public final class BrazilianAnalyzer extends Analyzer { }; /** - * Returns a (possibly reused) TokenStream which tokenizes all the text - * in the provided Reader. + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text + * in the provided {@link Reader}. * - * @return A TokenStream built from a StandardTokenizer filtered with - * LowerCaseFilter, StandardFilter, StopFilter, and - * BrazilianStemFilter. + * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with + * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and + * {@link BrazilianStemFilter}. */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java index 3eff32f9faa..c6ed0b5b5b0 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java @@ -25,13 +25,13 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * Based on GermanStemFilter + * A {@link TokenFilter} that applies {@link BrazilianStemmer}. * */ public final class BrazilianStemFilter extends TokenFilter { /** - * The actual token in the input stream. + * {@link BrazilianStemmer} in use by this filter. 
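The stopword and stem-exclusion behaviour of the Brazilian analyzer described above can be tried in a few lines. A sketch assuming the 2.9-era API, with an illustrative field name and sample Portuguese input:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.br.BrazilianAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class BrazilianAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        BrazilianAnalyzer analyzer = new BrazilianAnalyzer();
        // Words on the exclusion list are still indexed, but left unstemmed.
        analyzer.setStemExclusionTable(new String[] { "quintessência" });
        TokenStream ts = analyzer.tokenStream("contents",
            new StringReader("a quintessência das palavras"));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // stopwords dropped, remaining terms stemmed
        }
        ts.close();
      }
    }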
*/ private BrazilianStemmer stemmer = null; private Set exclusions = null; diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java index b358f02711b..aaea8ccad6c 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java @@ -18,7 +18,7 @@ package org.apache.lucene.analysis.br; */ /** - * A stemmer for Brazilian words. + * A stemmer for Brazilian Portuguese words. */ public class BrazilianStemmer { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/package.html b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/package.html index 62f98d78143..dfcdeea0aa4 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/package.html +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/package.html @@ -1,5 +1,5 @@ -Analyzer for Brazilian. +Analyzer for Brazilian Portuguese. diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java index ee39161d157..f5e871b5722 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java @@ -28,7 +28,8 @@ import java.util.Set; /** - * Filters CJKTokenizer with StopFilter. + * An {@link Analyzer} that tokenizes text with {@link CJKTokenizer} and + * filters with {@link StopFilter} * */ public class CJKAnalyzer extends Analyzer { @@ -77,11 +78,12 @@ public class CJKAnalyzer extends Analyzer { //~ Methods ---------------------------------------------------------------- /** - * get token stream from input + * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * * @param fieldName lucene field name - * @param reader input reader - * @return TokenStream + * @param reader input {@link Reader} + * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with + * {@link StopFilter} */ public final TokenStream tokenStream(String fieldName, Reader reader) { return new StopFilter(new CJKTokenizer(reader), stopTable); @@ -93,11 +95,13 @@ public class CJKAnalyzer extends Analyzer { }; /** - * get (possibly reused) token stream from input + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text + * in the provided {@link Reader}. 
* * @param fieldName lucene field name - * @param reader input reader - * @return TokenStream + * @param reader Input {@link Reader} + * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with + * {@link StopFilter} */ public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { /* tokenStream() is final, no back compat issue */ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java index 68fe8d54490..5ddd4c9e9d6 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java @@ -27,13 +27,20 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** - * CJKTokenizer was modified from StopTokenizer which does a decent job for - * most European languages. It performs other token methods for double-byte - * Characters: the token will return at each two characters with overlap match.
- * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
- * also need filter filter zero length token ""
- * for Digit: digit, '+', '#' will token as letter
- * for more info on Asia language(Chinese Japanese Korean) text segmentation:
+ * CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
+ *
+ * The tokens returned are every two adjacent characters with overlap match.
+ *
+ * Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
+ *
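The segmentation in this example is easy to reproduce. A small sketch in which the CJK sample string is arbitrary; it prints "java" followed by the overlapping bigrams:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class CJKTokenizerDemo {
      public static void main(String[] args) throws Exception {
        // Latin text stays whole ("java"); the CJK characters should come out
        // as overlapping bigrams: 多语, 语言, 言处, 处理.
        TokenStream ts = new CJKTokenizer(new StringReader("java 多语言处理"));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term());
        }
        ts.close();
      }
    }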

+ * Additionally, the following is applied to Latin text (such as English): + * + * For more info on Asian language (Chinese, Japanese, and Korean) text segmentation: * please search google * diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java index 5470a4f215a..1024e7bfd51 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java @@ -24,13 +24,8 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; /** - * Title: ChineseAnalyzer - * Description: - * Subclass of org.apache.lucene.analysis.Analyzer - * build from a ChineseTokenizer, filtered with ChineseFilter. - * Copyright: Copyright (c) 2001 - * Company: - * @version 1.0 + * An {@link Analyzer} that tokenizes text with {@link ChineseTokenizer} and + * filters with {@link ChineseFilter} * */ @@ -40,9 +35,10 @@ public class ChineseAnalyzer extends Analyzer { } /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. + * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * - * @return A TokenStream build from a ChineseTokenizer filtered with ChineseFilter. + * @return A {@link TokenStream} built from a {@link ChineseTokenizer} + * filtered with {@link ChineseFilter}. */ public final TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new ChineseTokenizer(reader); @@ -56,11 +52,11 @@ public class ChineseAnalyzer extends Analyzer { }; /** - * Returns a (possibly reused) TokenStream which tokenizes all the text in the - * provided Reader. + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the + * provided {@link Reader}. * - * @return A TokenStream build from a ChineseTokenizer filtered with - * ChineseFilter. + * @return A {@link TokenStream} built from a {@link ChineseTokenizer} + * filtered with {@link ChineseFilter}. */ public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java index 31de4a7f0a5..7e847fb48ef 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java @@ -26,18 +26,19 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * Title: ChineseFilter - * Description: Filter with a stop word table - * Rule: No digital is allowed. - * English word/token should larger than 1 character. - * One Chinese character as one Chinese word. + * A {@link TokenFilter} with a stop word table. + * * TO DO: - * 1. Add Chinese stop words, such as \ue400 - * 2. Dictionary based Chinese word extraction - * 3. Intelligent Chinese word extraction - * - * Copyright: Copyright (c) 2001 - * Company: + *
+ *   1. Add Chinese stop words, such as \ue400
+ *   2. Dictionary based Chinese word extraction
+ *   3. Intelligent Chinese word extraction
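The rules carried over from the old javadoc (digits and single Latin letters dropped, each Chinese character kept as its own token, English stop words removed) can be observed by wiring the tokenizer and filter together by hand. A sketch with made-up input; the expected behaviour is noted in the comment:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cn.ChineseFilter;
    import org.apache.lucene.analysis.cn.ChineseTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ChineseFilterDemo {
      public static void main(String[] args) throws Exception {
        // Expected survivors: "一" (single Chinese character) and "one";
        // "a" (one Latin letter), "1" (digit) and "and" (stop word) should be dropped.
        TokenStream ts = new ChineseFilter(
            new ChineseTokenizer(new StringReader("一 one a 1 and")));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term());
        }
        ts.close();
      }
    }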
+ * * @version 1.0 * */ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java index 1d38378094f..0c5ee549cd1 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java @@ -27,28 +27,29 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * Title: ChineseTokenizer - * Description: Extract tokens from the Stream using Character.getType() - * Rule: A Chinese character as a single token - * Copyright: Copyright (c) 2001 - * Company: - * - * The difference between thr ChineseTokenizer and the - * CJKTokenizer (id=23545) is that they have different - * token parsing logic. + * Tokenize Chinese text as individual chinese characters. * - * Let me use an example. If having a Chinese text - * "C1C2C3C4" to be indexed, the tokens returned from the - * ChineseTokenizer are C1, C2, C3, C4. And the tokens - * returned from the CJKTokenizer are C1C2, C2C3, C3C4. - * - * Therefore the index the CJKTokenizer created is much - * larger. - * + *

+ * The difference between ChineseTokenizer and
+ * CJKTokenizer is that they have different
+ * token parsing logic.
+ *
+ * For example, if the Chinese text
+ * "C1C2C3C4" is to be indexed:
+ *
+ *   The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
+ *   The tokens returned from CJKTokenizer are C1C2, C2C3, C3C4.
+ *
+ * Therefore the index created by CJKTokenizer is much larger.
+ *
 * The problem is that when searching for C1, C1C2, C1C3,
 * C4C2, C1C2C3 ... the ChineseTokenizer works, but the
 * CJKTokenizer will not work.
- *
+ *
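A side-by-side sketch of the two tokenizers makes the size difference concrete; the sample text is arbitrary and stands in for "C1C2...C7", with the expected outputs noted in comments:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKTokenizer;
    import org.apache.lucene.analysis.cn.ChineseTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ChineseVsCJKDemo {
      static void dump(String label, TokenStream ts) throws Exception {
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        System.out.print(label + ":");
        while (ts.incrementToken()) {
          System.out.print(" " + term.term());
        }
        System.out.println();
        ts.close();
      }

      public static void main(String[] args) throws Exception {
        String text = "中华人民共和国"; // arbitrary 7-character sample
        dump("ChineseTokenizer", new ChineseTokenizer(new StringReader(text))); // 中 华 人 民 共 和 国
        dump("CJKTokenizer", new CJKTokenizer(new StringReader(text)));         // 中华 华人 人民 民共 共和 和国
      }
    }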

* @version 1.0 * */ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html index aaa58651118..57027ee9082 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html @@ -3,7 +3,7 @@ -Analyzer for Chinese, which indexes unigrams (individuals chinese characters). +Analyzer for Chinese, which indexes unigrams (individual chinese characters).

Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.