LUCENE-1826: Add constructors that take AttributeSource and AttributeFactory to all Tokenizer implementations.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@806942 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Busch 2009-08-23 08:34:22 +00:00
parent fd861761f0
commit 64ed5f39a5
16 changed files with 329 additions and 31 deletions
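The change is easiest to see from the caller's side: a Tokenizer can now be pointed at an existing AttributeSource (so several streams share the same attribute instances) or at an AttributeFactory (which controls how attribute instances are created). A minimal caller-side sketch, assuming the post-commit API; this code is illustrative and not part of the commit:

import java.io.StringReader;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

public class SharedAttributeSourceExample {
  public static void main(String[] args) throws Exception {
    // Two tokenizers writing into the same AttributeSource, and therefore
    // into the same TermAttribute instance.
    AttributeSource shared = new AttributeSource();
    WhitespaceTokenizer first = new WhitespaceTokenizer(shared, new StringReader("foo bar"));
    WhitespaceTokenizer second = new WhitespaceTokenizer(shared, new StringReader("baz"));

    TermAttribute term = (TermAttribute) shared.addAttribute(TermAttribute.class);
    while (first.incrementToken()) {
      System.out.println(term.term());   // "foo", then "bar"
    }
    while (second.incrementToken()) {
      System.out.println(term.term());   // "baz"
    }

    // The factory variant: DEFAULT_ATTRIBUTE_FACTORY is assumed here to be the
    // stock factory that ships with AttributeSource in this version.
    WhitespaceTokenizer third = new WhitespaceTokenizer(
        AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, new StringReader("qux"));
  }
}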

View File

@@ -402,6 +402,10 @@ API Changes
36. LUCENE-1808: Query.createWeight has been changed from protected to
public. (Tim Smith, Shai Erera via Mark Miller)
37. LUCENE-1826: Add constructors that take AttributeSource and
AttributeFactory to all Tokenizer implementations.
(Michael Busch)
Bug fixes
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()

View File

@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ar;
import java.io.Reader;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.util.AttributeSource;
/**
* The problem with the standard Letter tokenizer is that it fails on diacritics.
@@ -32,6 +33,14 @@ public class ArabicLetterTokenizer extends LetterTokenizer {
super(in);
}
public ArabicLetterTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
public ArabicLetterTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/**
* Allows for Letter category or NonspacingMark category
* @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)

View File

@@ -24,6 +24,8 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/**
@@ -111,11 +113,25 @@ public final class CJKTokenizer extends Tokenizer {
*/
public CJKTokenizer(Reader in) {
super(in);
init();
}
public CJKTokenizer(AttributeSource source, Reader in) {
super(source, in);
init();
}
public CJKTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
init();
}
private void init() {
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
//~ Methods ----------------------------------------------------------------
/**

View File

@@ -24,6 +24,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
/**
@@ -59,10 +60,24 @@ public final class ChineseTokenizer extends Tokenizer {
public ChineseTokenizer(Reader in) {
super(in);
init();
}
public ChineseTokenizer(AttributeSource source, Reader in) {
super(source, in);
init();
}
public ChineseTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
init();
}
private void init() {
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
private int offset = 0, bufferIndex=0, dataLen=0;
private final static int MAX_WORD_LEN = 255;
private final static int IO_BUFFER_SIZE = 1024;

View File

@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.io.Reader;
@@ -88,7 +89,76 @@ public class EdgeNGramTokenizer extends Tokenizer {
*/
public EdgeNGramTokenizer(Reader input, Side side, int minGram, int maxGram) {
super(input);
init(side, minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param source {@link AttributeSource} to use
* @param input {@link Reader} holding the input to be tokenized
* @param side the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(AttributeSource source, Reader input, Side side, int minGram, int maxGram) {
super(source, input);
init(side, minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param factory {@link AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param side the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) {
super(factory, input);
init(side, minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param input {@link Reader} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
this(input, Side.getSide(sideLabel), minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param source {@link AttributeSource} to use
* @param input {@link Reader} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(AttributeSource source, Reader input, String sideLabel, int minGram, int maxGram) {
this(source, input, Side.getSide(sideLabel), minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param factory {@link AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(AttributeFactory factory, Reader input, String sideLabel, int minGram, int maxGram) {
this(factory, input, Side.getSide(sideLabel), minGram, maxGram);
}
private void init(Side side, int minGram, int maxGram) {
if (side == null) {
throw new IllegalArgumentException("sideLabel must be either front or back");
}
@@ -107,17 +177,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param input {@link Reader} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
this(input, Side.getSide(sideLabel), minGram, maxGram);
}
/** Returns the next token in the stream, or null at EOS. */
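Taken together, the sideLabel/minGram/maxGram parameters mean the tokenizer emits the leading (or trailing) n-grams of its input. A hedged usage sketch, assuming the pre-3.0 behaviour of gramming the whole input from its edge; the expected output is an assumption, not something stated in this commit:

import java.io.StringReader;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class EdgeNGramExample {
  public static void main(String[] args) throws Exception {
    // "front" edge n-grams of sizes 1..3 over "lucene": expected "l", "lu", "luc".
    EdgeNGramTokenizer tok = new EdgeNGramTokenizer(new StringReader("lucene"), "front", 1, 3);
    TermAttribute term = (TermAttribute) tok.addAttribute(TermAttribute.class);
    while (tok.incrementToken()) {
      System.out.println(term.term());
    }
  }
}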

View File

@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.io.Reader;
@@ -50,6 +51,42 @@ public class NGramTokenizer extends Tokenizer {
*/
public NGramTokenizer(Reader input, int minGram, int maxGram) {
super(input);
init(minGram, maxGram);
}
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param source {@link AttributeSource} to use
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) {
super(source, input);
init(minGram, maxGram);
}
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param factory {@link AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(factory, input);
init(minGram, maxGram);
}
/**
* Creates NGramTokenizer with default min and max n-grams.
* @param input {@link Reader} holding the input to be tokenized
*/
public NGramTokenizer(Reader input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}
private void init(int minGram, int maxGram) {
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -60,14 +97,7 @@ public class NGramTokenizer extends Tokenizer {
this.maxGram = maxGram;
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
/**
* Creates NGramTokenizer with default min and max n-grams.
* @param input {@link Reader} holding the input to be tokenized
*/
public NGramTokenizer(Reader input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
/** Returns the next token in the stream, or null at EOS. */
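With only a Reader, the defaults referenced above (DEFAULT_MIN_NGRAM_SIZE and DEFAULT_MAX_NGRAM_SIZE) apply; with explicit sizes, this pre-3.0 implementation emits all grams of one size before moving on to the next. A sketch, with the expected terms stated as an assumption rather than taken from this commit:

import java.io.StringReader;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class NGramExample {
  public static void main(String[] args) throws Exception {
    // Sizes 1..2 over "abcd": expected "a", "b", "c", "d", then "ab", "bc", "cd".
    NGramTokenizer tok = new NGramTokenizer(new StringReader("abcd"), 1, 2);
    TermAttribute term = (TermAttribute) tok.addAttribute(TermAttribute.class);
    while (tok.incrementToken()) {
      System.out.println(term.term());
    }
  }
}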

View File

@@ -21,6 +21,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.Tokenizer; // for javadocs
import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
import org.apache.lucene.util.AttributeSource;
/**
* A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
@@ -57,6 +58,18 @@ public class RussianLetterTokenizer extends CharTokenizer
this(in, RussianCharsets.UnicodeRussian);
}
public RussianLetterTokenizer(AttributeSource source, Reader in, char[] charset)
{
super(source, in);
this.charset = charset;
}
public RussianLetterTokenizer(AttributeFactory factory, Reader in, char[] charset)
{
super(factory, in);
this.charset = charset;
}
/**
* Collects only characters which satisfy
* {@link Character#isLetter(char)}.

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Tokenizes input text into sentences.
@@ -48,11 +49,25 @@ public final class SentenceTokenizer extends Tokenizer {
public SentenceTokenizer(Reader reader) {
super(reader);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
init();
}
public SentenceTokenizer(AttributeSource source, Reader reader) {
super(source, reader);
init();
}
public SentenceTokenizer(AttributeFactory factory, Reader reader) {
super(factory, reader);
init();
}
private void init() {
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
public boolean incrementToken() throws IOException {
clearAttributes();
buffer.setLength(0);

View File

@@ -151,14 +151,46 @@ public class WikipediaTokenizer extends Tokenizer {
*/
public WikipediaTokenizer(Reader input, int tokenOutput, Set untokenizedTypes) {
super(input);
this.tokenOutput = tokenOutput;
this.scanner = new WikipediaTokenizerImpl(input);
init(tokenOutput, untokenizedTypes);
}
/**
* Creates a new instance of the {@link org.apache.lucene.wikipedia.analysis.WikipediaTokenizer}. Attaches the
* <code>input</code> to a newly created JFlex scanner. Uses the given {@link AttributeFactory}.
*
* @param input The input
* @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
* @param untokenizedTypes
*/
public WikipediaTokenizer(AttributeFactory factory, Reader input, int tokenOutput, Set untokenizedTypes) {
super(factory, input);
this.scanner = new WikipediaTokenizerImpl(input);
init(tokenOutput, untokenizedTypes);
}
/**
* Creates a new instance of the {@link org.apache.lucene.wikipedia.analysis.WikipediaTokenizer}. Attaches the
* <code>input</code> to a newly created JFlex scanner. Uses the given {@link AttributeSource}.
*
* @param input The input
* @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
* @param untokenizedTypes
*/
public WikipediaTokenizer(AttributeSource source, Reader input, int tokenOutput, Set untokenizedTypes) {
super(source, input);
this.scanner = new WikipediaTokenizerImpl(input);
init(tokenOutput, untokenizedTypes);
}
private void init(int tokenOutput, Set untokenizedTypes) {
this.tokenOutput = tokenOutput;
this.untokenizedTypes = untokenizedTypes;
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
}
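The tokenOutput argument selects one of TOKENS_ONLY, UNTOKENIZED_ONLY, or BOTH, as the javadoc above notes. A hedged caller sketch, where the empty untokenizedTypes set and the sample markup are illustrative assumptions and not drawn from this commit:

import java.io.StringReader;
import java.util.Collections;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

public class WikipediaTokenizerExample {
  public static void main(String[] args) throws Exception {
    // Emit plain tokens only; pass no untokenized types.
    WikipediaTokenizer tok = new WikipediaTokenizer(
        new StringReader("[[Main Page]] plain text"),
        WikipediaTokenizer.TOKENS_ONLY,
        Collections.EMPTY_SET);
    TermAttribute term = (TermAttribute) tok.addAttribute(TermAttribute.class);
    while (tok.incrementToken()) {
      System.out.println(term.term());
    }
  }
}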
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should

View File

@@ -22,6 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/** An abstract base class for simple, character-oriented tokenizers.*/
public abstract class CharTokenizer extends Tokenizer {
@@ -31,6 +33,18 @@ public abstract class CharTokenizer extends Tokenizer {
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public CharTokenizer(AttributeSource source, Reader input) {
super(source, input);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public CharTokenizer(AttributeFactory factory, Reader input) {
super(factory, input);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;

View File

@@ -22,6 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Emits the entire input as a single token.
@@ -41,10 +42,24 @@ public class KeywordTokenizer extends Tokenizer {
public KeywordTokenizer(Reader input, int bufferSize) {
super(input);
init(bufferSize);
}
public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
super(source, input);
init(bufferSize);
}
public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
super(factory, input);
init(bufferSize);
}
private void init(int bufferSize) {
this.done = false;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
termAtt.resizeTermBuffer(bufferSize);
}
public final boolean incrementToken() throws IOException {

View File

@@ -19,6 +19,9 @@ package org.apache.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's
to say, it defines tokens as maximal strings of adjacent letters, as defined
by java.lang.Character.isLetter() predicate.
@@ -31,6 +34,16 @@ public class LetterTokenizer extends CharTokenizer {
public LetterTokenizer(Reader in) {
super(in);
}
/** Construct a new LetterTokenizer using a given {@link AttributeSource}. */
public LetterTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
/** Construct a new LetterTokenizer using a given {@link AttributeFactory}. */
public LetterTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/** Collects only characters which satisfy
* {@link Character#isLetter(char)}.*/

View File

@@ -19,6 +19,9 @@ package org.apache.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/**
* LowerCaseTokenizer performs the function of LetterTokenizer
* and LowerCaseFilter together. It divides text at non-letters and converts
@@ -35,6 +38,16 @@ public final class LowerCaseTokenizer extends LetterTokenizer {
super(in);
}
/** Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. */
public LowerCaseTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
/** Construct a new LowerCaseTokenizer using a given {@link AttributeFactory}. */
public LowerCaseTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/** Converts char to lower case
* {@link Character#toLowerCase(char)}.*/
protected char normalize(char c) {

View File

@@ -77,6 +77,18 @@ public abstract class Tokenizer extends TokenStream {
super(source);
}
/** Construct a token stream processing the given input using the given AttributeSource. */
protected Tokenizer(AttributeSource source, Reader input) {
super(source);
this.input = CharReader.get(input);
}
/** Construct a token stream processing the given input using the given AttributeSource. */
protected Tokenizer(AttributeSource source, CharStream input) {
super(source);
this.input = input;
}
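Most of the tokenizers touched by this commit follow the same shape on top of these base constructors: a private init() that registers the attributes, called from a Reader constructor, an AttributeSource constructor, and an AttributeFactory constructor (the AttributeFactory overload of Tokenizer is not shown in this hunk, but it is what subclasses such as KeywordTokenizer reach via super(factory, input)). Roughly, as an illustrative subclass rather than code from the commit:

import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

public class MyTokenizer extends Tokenizer {
  private TermAttribute termAtt;

  public MyTokenizer(Reader input) {
    super(input);
    init();
  }

  public MyTokenizer(AttributeSource source, Reader input) {
    super(source, input);   // reuse the caller's attribute instances
    init();
  }

  public MyTokenizer(AttributeFactory factory, Reader input) {
    super(factory, input);  // let the factory decide how attributes are instantiated
    init();
  }

  private void init() {
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  public boolean incrementToken() {
    return false;           // real tokenization logic omitted in this sketch
  }
}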
/** By default, closes the input Reader. */
public void close() throws IOException {
input.close();

View File

@@ -19,15 +19,28 @@ package org.apache.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
* Adjacent sequences of non-Whitespace characters form tokens. */
public class WhitespaceTokenizer extends CharTokenizer {
/** Construct a new WhitespaceTokenizer. */
public WhitespaceTokenizer(Reader in) {
super(in);
}
/** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */
public WhitespaceTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
/** Construct a new WhitespaceTokenizer using a given {@link AttributeFactory}. */
public WhitespaceTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/** Collects only characters which do not satisfy
* {@link Character#isWhitespace(char)}.*/
protected boolean isTokenChar(char c) {

View File

@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/** A grammar-based tokenizer constructed with JFlex
*
@@ -126,15 +127,38 @@ public class StandardTokenizer extends Tokenizer {
* See http://issues.apache.org/jira/browse/LUCENE-1068
*/
public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
this.replaceInvalidAcronym = replaceInvalidAcronym;
setInput(input);
super();
this.scanner = new StandardTokenizerImpl(input);
init(input, replaceInvalidAcronym);
}
/**
* Creates a new StandardTokenizer with a given {@link AttributeSource}.
*/
public StandardTokenizer(AttributeSource source, Reader input, boolean replaceInvalidAcronym) {
super(source);
this.scanner = new StandardTokenizerImpl(input);
init(input, replaceInvalidAcronym);
}
/**
* Creates a new StandardTokenizer with a given {@link AttributeFactory}.
*/
public StandardTokenizer(AttributeFactory factory, Reader input, boolean replaceInvalidAcronym) {
super(factory);
this.scanner = new StandardTokenizerImpl(input);
init(input, replaceInvalidAcronym);
}
private void init(Reader input, boolean replaceInvalidAcronym) {
this.replaceInvalidAcronym = replaceInvalidAcronym;
setInput(input);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
// this tokenizer generates three attributes:
// offset, positionIncrement and type
private TermAttribute termAtt;