LUCENE-1826: Add constructors that take AttributeSource and AttributeFactory to all Tokenizer implementations.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@806942 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Busch 2009-08-23 08:34:22 +00:00
parent fd861761f0
commit 64ed5f39a5
16 changed files with 329 additions and 31 deletions
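The change is easiest to see from the caller's side: a Tokenizer can now be pointed at an existing AttributeSource (so several streams share the same attribute instances) or at an AttributeFactory (which controls how attribute instances are created). A minimal caller-side sketch, assuming the post-commit API; this code is illustrative and not part of the commit:

import java.io.StringReader;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

public class SharedAttributeSourceExample {
  public static void main(String[] args) throws Exception {
    // Two tokenizers writing into the same AttributeSource, and therefore
    // into the same TermAttribute instance.
    AttributeSource shared = new AttributeSource();
    WhitespaceTokenizer first = new WhitespaceTokenizer(shared, new StringReader("foo bar"));
    WhitespaceTokenizer second = new WhitespaceTokenizer(shared, new StringReader("baz"));

    TermAttribute term = (TermAttribute) shared.addAttribute(TermAttribute.class);
    while (first.incrementToken()) {
      System.out.println(term.term());   // "foo", then "bar"
    }
    while (second.incrementToken()) {
      System.out.println(term.term());   // "baz"
    }

    // The factory variant: DEFAULT_ATTRIBUTE_FACTORY is assumed here to be the
    // stock factory that ships with AttributeSource in this version.
    WhitespaceTokenizer third = new WhitespaceTokenizer(
        AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, new StringReader("qux"));
  }
}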

View File

@@ -402,6 +402,10 @@ API Changes
36. LUCENE-1808: Query.createWeight has been changed from protected to
public. (Tim Smith, Shai Erera via Mark Miller)
37. LUCENE-1826: Add constructors that take AttributeSource and
AttributeFactory to all Tokenizer implementations.
(Michael Busch)
Bug fixes
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()

View File

@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ar;
import java.io.Reader;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.util.AttributeSource;
/**
* The problem with the standard Letter tokenizer is that it fails on diacritics.
@@ -32,6 +33,14 @@ public class ArabicLetterTokenizer extends LetterTokenizer {
super(in);
}
public ArabicLetterTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
public ArabicLetterTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/**
* Allows for Letter category or NonspacingMark category
* @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)

View File

@@ -24,6 +24,8 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/**
@@ -111,11 +113,25 @@ public final class CJKTokenizer extends Tokenizer {
*/
public CJKTokenizer(Reader in) {
super(in);
init();
}
public CJKTokenizer(AttributeSource source, Reader in) {
super(source, in);
init();
}
public CJKTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
init();
}
private void init() {
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
//~ Methods ----------------------------------------------------------------
/**

View File

@@ -24,6 +24,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
/**
@@ -59,10 +60,24 @@ public final class ChineseTokenizer extends Tokenizer {
public ChineseTokenizer(Reader in) {
super(in);
init();
}
public ChineseTokenizer(AttributeSource source, Reader in) {
super(source, in);
init();
}
public ChineseTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
init();
}
private void init() {
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
private int offset = 0, bufferIndex=0, dataLen=0;
private final static int MAX_WORD_LEN = 255;
private final static int IO_BUFFER_SIZE = 1024;

View File

@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.io.Reader;
@@ -88,7 +89,76 @@ public class EdgeNGramTokenizer extends Tokenizer {
*/
public EdgeNGramTokenizer(Reader input, Side side, int minGram, int maxGram) {
super(input);
init(side, minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param source {@link AttributeSource} to use
* @param input {@link Reader} holding the input to be tokenized
* @param side the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(AttributeSource source, Reader input, Side side, int minGram, int maxGram) {
super(source, input);
init(side, minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param factory {@link AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param side the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) {
super(factory, input);
init(side, minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param input {@link Reader} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
this(input, Side.getSide(sideLabel), minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param source {@link AttributeSource} to use
* @param input {@link Reader} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(AttributeSource source, Reader input, String sideLabel, int minGram, int maxGram) {
this(source, input, Side.getSide(sideLabel), minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param factory {@link AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(AttributeFactory factory, Reader input, String sideLabel, int minGram, int maxGram) {
this(factory, input, Side.getSide(sideLabel), minGram, maxGram);
}
private void init(Side side, int minGram, int maxGram) {
if (side == null) {
throw new IllegalArgumentException("sideLabel must be either front or back");
}
@@ -107,17 +177,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param input {@link Reader} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
this(input, Side.getSide(sideLabel), minGram, maxGram);
}
/** Returns the next token in the stream, or null at EOS. */
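Taken together, the sideLabel/minGram/maxGram parameters mean the tokenizer emits the leading (or trailing) n-grams of its input. A hedged usage sketch, assuming the pre-3.0 behaviour of gramming the whole input from its edge; the expected output is an assumption, not something stated in this commit:

import java.io.StringReader;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class EdgeNGramExample {
  public static void main(String[] args) throws Exception {
    // "front" edge n-grams of sizes 1..3 over "lucene": expected "l", "lu", "luc".
    EdgeNGramTokenizer tok = new EdgeNGramTokenizer(new StringReader("lucene"), "front", 1, 3);
    TermAttribute term = (TermAttribute) tok.addAttribute(TermAttribute.class);
    while (tok.incrementToken()) {
      System.out.println(term.term());
    }
  }
}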

View File

@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.io.Reader;
@@ -50,6 +51,42 @@ public class NGramTokenizer extends Tokenizer {
*/
public NGramTokenizer(Reader input, int minGram, int maxGram) {
super(input);
init(minGram, maxGram);
}
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param source {@link AttributeSource} to use
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) {
super(source, input);
init(minGram, maxGram);
}
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param factory {@link AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(factory, input);
init(minGram, maxGram);
}
/**
* Creates NGramTokenizer with default min and max n-grams.
* @param input {@link Reader} holding the input to be tokenized
*/
public NGramTokenizer(Reader input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}
private void init(int minGram, int maxGram) {
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -60,14 +97,7 @@ public class NGramTokenizer extends Tokenizer {
this.maxGram = maxGram;
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
/**
* Creates NGramTokenizer with default min and max n-grams.
* @param input {@link Reader} holding the input to be tokenized
*/
public NGramTokenizer(Reader input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
/** Returns the next token in the stream, or null at EOS. */
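With only a Reader, the defaults referenced above (DEFAULT_MIN_NGRAM_SIZE and DEFAULT_MAX_NGRAM_SIZE) apply; with explicit sizes, this pre-3.0 implementation emits all grams of one size before moving on to the next. A sketch, with the expected terms stated as an assumption rather than taken from this commit:

import java.io.StringReader;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class NGramExample {
  public static void main(String[] args) throws Exception {
    // Sizes 1..2 over "abcd": expected "a", "b", "c", "d", then "ab", "bc", "cd".
    NGramTokenizer tok = new NGramTokenizer(new StringReader("abcd"), 1, 2);
    TermAttribute term = (TermAttribute) tok.addAttribute(TermAttribute.class);
    while (tok.incrementToken()) {
      System.out.println(term.term());
    }
  }
}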

View File

@@ -21,6 +21,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.Tokenizer; // for javadocs
import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
import org.apache.lucene.util.AttributeSource;
/**
* A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
@@ -57,6 +58,18 @@ public class RussianLetterTokenizer extends CharTokenizer
this(in, RussianCharsets.UnicodeRussian);
}
public RussianLetterTokenizer(AttributeSource source, Reader in, char[] charset)
{
super(source, in);
this.charset = charset;
}
public RussianLetterTokenizer(AttributeFactory factory, Reader in, char[] charset)
{
super(factory, in);
this.charset = charset;
}
/**
* Collects only characters which satisfy
* {@link Character#isLetter(char)}.

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Tokenizes input text into sentences.
@@ -48,11 +49,25 @@ public final class SentenceTokenizer extends Tokenizer {
public SentenceTokenizer(Reader reader) {
super(reader);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
init();
}
public SentenceTokenizer(AttributeSource source, Reader reader) {
super(source, reader);
init();
}
public SentenceTokenizer(AttributeFactory factory, Reader reader) {
super(factory, reader);
init();
}
private void init() {
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
public boolean incrementToken() throws IOException {
clearAttributes();
buffer.setLength(0);

View File

@@ -151,14 +151,46 @@ public class WikipediaTokenizer extends Tokenizer {
*/
public WikipediaTokenizer(Reader input, int tokenOutput, Set untokenizedTypes) {
super(input);
this.tokenOutput = tokenOutput;
this.scanner = new WikipediaTokenizerImpl(input);
init(tokenOutput, untokenizedTypes);
}
/**
* Creates a new instance of the {@link org.apache.lucene.wikipedia.analysis.WikipediaTokenizer}. Attaches the
* <code>input</code> to a newly created JFlex scanner. Uses the given {@link AttributeFactory}.
*
* @param input The input
* @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
* @param untokenizedTypes
*/
public WikipediaTokenizer(AttributeFactory factory, Reader input, int tokenOutput, Set untokenizedTypes) {
super(factory, input);
this.scanner = new WikipediaTokenizerImpl(input);
init(tokenOutput, untokenizedTypes);
}
/**
* Creates a new instance of the {@link org.apache.lucene.wikipedia.analysis.WikipediaTokenizer}. Attaches the
* <code>input</code> to a newly created JFlex scanner. Uses the given {@link AttributeSource}.
*
* @param input The input
* @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
* @param untokenizedTypes
*/
public WikipediaTokenizer(AttributeSource source, Reader input, int tokenOutput, Set untokenizedTypes) {
super(source, input);
this.scanner = new WikipediaTokenizerImpl(input);
init(tokenOutput, untokenizedTypes);
}
private void init(int tokenOutput, Set untokenizedTypes) {
this.tokenOutput = tokenOutput;
this.untokenizedTypes = untokenizedTypes;
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
}
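The tokenOutput argument selects one of TOKENS_ONLY, UNTOKENIZED_ONLY, or BOTH, as the javadoc above notes. A hedged caller sketch, where the empty untokenizedTypes set and the sample markup are illustrative assumptions and not drawn from this commit:

import java.io.StringReader;
import java.util.Collections;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

public class WikipediaTokenizerExample {
  public static void main(String[] args) throws Exception {
    // Emit plain tokens only; pass no untokenized types.
    WikipediaTokenizer tok = new WikipediaTokenizer(
        new StringReader("[[Main Page]] plain text"),
        WikipediaTokenizer.TOKENS_ONLY,
        Collections.EMPTY_SET);
    TermAttribute term = (TermAttribute) tok.addAttribute(TermAttribute.class);
    while (tok.incrementToken()) {
      System.out.println(term.term());
    }
  }
}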
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should

View File

@@ -22,6 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/** An abstract base class for simple, character-oriented tokenizers.*/
public abstract class CharTokenizer extends Tokenizer {
@@ -31,6 +33,18 @@ public abstract class CharTokenizer extends Tokenizer {
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public CharTokenizer(AttributeSource source, Reader input) {
super(source, input);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public CharTokenizer(AttributeFactory factory, Reader input) {
super(factory, input);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;

View File

@@ -22,6 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Emits the entire input as a single token.
@@ -41,10 +42,24 @@ public class KeywordTokenizer extends Tokenizer {
public KeywordTokenizer(Reader input, int bufferSize) {
super(input);
init(bufferSize);
}
public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
super(source, input);
init(bufferSize);
}
public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
super(factory, input);
init(bufferSize);
}
private void init(int bufferSize) {
this.done = false;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
termAtt.resizeTermBuffer(bufferSize);
}
public final boolean incrementToken() throws IOException {

View File

@@ -19,6 +19,9 @@ package org.apache.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's
to say, it defines tokens as maximal strings of adjacent letters, as defined
by java.lang.Character.isLetter() predicate.
@@ -31,6 +34,16 @@ public class LetterTokenizer extends CharTokenizer {
public LetterTokenizer(Reader in) {
super(in);
}
/** Construct a new LetterTokenizer using a given {@link AttributeSource}. */
public LetterTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
/** Construct a new LetterTokenizer using a given {@link AttributeFactory}. */
public LetterTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/** Collects only characters which satisfy
* {@link Character#isLetter(char)}.*/

View File

@@ -19,6 +19,9 @@ package org.apache.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/**
* LowerCaseTokenizer performs the function of LetterTokenizer
* and LowerCaseFilter together. It divides text at non-letters and converts
@@ -35,6 +38,16 @@ public final class LowerCaseTokenizer extends LetterTokenizer {
super(in);
}
/** Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. */
public LowerCaseTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
/** Construct a new LowerCaseTokenizer using a given {@link AttributeFactory}. */
public LowerCaseTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/** Converts char to lower case
* {@link Character#toLowerCase(char)}.*/
protected char normalize(char c) {

View File

@@ -77,6 +77,18 @@ public abstract class Tokenizer extends TokenStream {
super(source);
}
/** Construct a token stream processing the given input using the given AttributeSource. */
protected Tokenizer(AttributeSource source, Reader input) {
super(source);
this.input = CharReader.get(input);
}
/** Construct a token stream processing the given input using the given AttributeSource. */
protected Tokenizer(AttributeSource source, CharStream input) {
super(source);
this.input = input;
}
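Most of the tokenizers touched by this commit follow the same shape on top of these base constructors: a private init() that registers the attributes, called from a Reader constructor, an AttributeSource constructor, and an AttributeFactory constructor (the AttributeFactory overload of Tokenizer is not shown in this hunk, but it is what subclasses such as KeywordTokenizer reach via super(factory, input)). Roughly, as an illustrative subclass rather than code from the commit:

import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

public class MyTokenizer extends Tokenizer {
  private TermAttribute termAtt;

  public MyTokenizer(Reader input) {
    super(input);
    init();
  }

  public MyTokenizer(AttributeSource source, Reader input) {
    super(source, input);   // reuse the caller's attribute instances
    init();
  }

  public MyTokenizer(AttributeFactory factory, Reader input) {
    super(factory, input);  // let the factory decide how attributes are instantiated
    init();
  }

  private void init() {
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  public boolean incrementToken() {
    return false;           // real tokenization logic omitted in this sketch
  }
}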
/** By default, closes the input Reader. */
public void close() throws IOException {
input.close();

View File

@@ -19,15 +19,28 @@ package org.apache.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
* Adjacent sequences of non-Whitespace characters form tokens. */
public class WhitespaceTokenizer extends CharTokenizer {
/** Construct a new WhitespaceTokenizer. */
public WhitespaceTokenizer(Reader in) {
super(in);
}
/** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */
public WhitespaceTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
/** Construct a new WhitespaceTokenizer using a given {@link AttributeFactory}. */
public WhitespaceTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/** Collects only characters which do not satisfy
* {@link Character#isWhitespace(char)}.*/
protected boolean isTokenChar(char c) {

View File

@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/** A grammar-based tokenizer constructed with JFlex
*
@@ -126,15 +127,38 @@ public class StandardTokenizer extends Tokenizer {
* See http://issues.apache.org/jira/browse/LUCENE-1068
*/
public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
this.replaceInvalidAcronym = replaceInvalidAcronym;
setInput(input);
super();
this.scanner = new StandardTokenizerImpl(input);
init(input, replaceInvalidAcronym);
}
/**
* Creates a new StandardTokenizer with a given {@link AttributeSource}.
*/
public StandardTokenizer(AttributeSource source, Reader input, boolean replaceInvalidAcronym) {
super(source);
this.scanner = new StandardTokenizerImpl(input);
init(input, replaceInvalidAcronym);
}
/**
* Creates a new StandardTokenizer with a given {@link AttributeFactory}.
*/
public StandardTokenizer(AttributeFactory factory, Reader input, boolean replaceInvalidAcronym) {
super(factory);
this.scanner = new StandardTokenizerImpl(input);
init(input, replaceInvalidAcronym);
}
private void init(Reader input, boolean replaceInvalidAcronym) {
this.replaceInvalidAcronym = replaceInvalidAcronym;
setInput(input);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
// this tokenizer generates three attributes:
// offset, positionIncrement and type
private TermAttribute termAtt;