mirror of https://github.com/apache/lucene.git
LUCENE-1826: Add constructors that take AttributeSource and AttributeFactory to all Tokenizer implementations.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@806942 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in: parent fd861761f0, commit 64ed5f39a5
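In short, every Tokenizer now offers three construction paths: the existing Reader-only constructor, one that reuses an existing AttributeSource, and one that takes an AttributeSource.AttributeFactory. A minimal usage sketch against the 2.9-era API (the class wrapper, input strings, and the choice of DEFAULT_ATTRIBUTE_FACTORY are illustrative, not part of this commit):

    import java.io.StringReader;

    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.AttributeSource.AttributeFactory;

    public class TokenizerConstructorsDemo {
      public static void main(String[] args) {
        // Unchanged path: the Reader-only constructor.
        WhitespaceTokenizer t1 = new WhitespaceTokenizer(new StringReader("hello world"));

        // New in LUCENE-1826: pass the AttributeFactory used to create attribute instances.
        WhitespaceTokenizer t2 = new WhitespaceTokenizer(
            AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, new StringReader("hello world"));

        // New in LUCENE-1826: reuse an existing stream's AttributeSource (t1 is one,
        // since every TokenStream extends AttributeSource).
        WhitespaceTokenizer t3 = new WhitespaceTokenizer(t1, new StringReader("hello world"));
      }
    }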
CHANGES.txt
@@ -402,6 +402,10 @@ API Changes

36. LUCENE-1808: Query.createWeight has been changed from protected to
    public. (Tim Smith, Shai Erera via Mark Miller)

37. LUCENE-1826: Add constructors that take AttributeSource and
    AttributeFactory to all Tokenizer implementations.
    (Michael Busch)

Bug fixes

1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
ArabicLetterTokenizer.java
@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ar;

import java.io.Reader;

import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.util.AttributeSource;

/**
 * The problem with the standard Letter tokenizer is that it fails on diacritics.
@@ -32,6 +33,14 @@ public class ArabicLetterTokenizer extends LetterTokenizer {
    super(in);
  }

  public ArabicLetterTokenizer(AttributeSource source, Reader in) {
    super(source, in);
  }

  public ArabicLetterTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
  }

  /**
   * Allows for Letter category or NonspacingMark category
   * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)
CJKTokenizer.java
@@ -24,6 +24,8 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

/**
@@ -111,11 +113,25 @@ public final class CJKTokenizer extends Tokenizer {
   */
  public CJKTokenizer(Reader in) {
    super(in);
    init();
  }

  public CJKTokenizer(AttributeSource source, Reader in) {
    super(source, in);
    init();
  }

  public CJKTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
    init();
  }

  private void init() {
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
  }

  //~ Methods ----------------------------------------------------------------

  /**
ChineseTokenizer.java
@@ -24,6 +24,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

/**
@@ -59,10 +60,24 @@ public final class ChineseTokenizer extends Tokenizer {

  public ChineseTokenizer(Reader in) {
    super(in);
    init();
  }

  public ChineseTokenizer(AttributeSource source, Reader in) {
    super(source, in);
    init();
  }

  public ChineseTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
    init();
  }

  private void init() {
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }

  private int offset = 0, bufferIndex=0, dataLen=0;
  private final static int MAX_WORD_LEN = 255;
  private final static int IO_BUFFER_SIZE = 1024;
EdgeNGramTokenizer.java
@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

import java.io.IOException;
import java.io.Reader;
@@ -88,7 +89,76 @@ public class EdgeNGramTokenizer extends Tokenizer {
   */
  public EdgeNGramTokenizer(Reader input, Side side, int minGram, int maxGram) {
    super(input);
    init(side, minGram, maxGram);
  }

  /**
   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
   *
   * @param source {@link AttributeSource} to use
   * @param input {@link Reader} holding the input to be tokenized
   * @param side the {@link Side} from which to chop off an n-gram
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public EdgeNGramTokenizer(AttributeSource source, Reader input, Side side, int minGram, int maxGram) {
    super(source, input);
    init(side, minGram, maxGram);
  }

  /**
   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
   *
   * @param factory {@link AttributeFactory} to use
   * @param input {@link Reader} holding the input to be tokenized
   * @param side the {@link Side} from which to chop off an n-gram
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public EdgeNGramTokenizer(AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) {
    super(factory, input);
    init(side, minGram, maxGram);
  }

  /**
   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
   *
   * @param input {@link Reader} holding the input to be tokenized
   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
    this(input, Side.getSide(sideLabel), minGram, maxGram);
  }

  /**
   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
   *
   * @param source {@link AttributeSource} to use
   * @param input {@link Reader} holding the input to be tokenized
   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public EdgeNGramTokenizer(AttributeSource source, Reader input, String sideLabel, int minGram, int maxGram) {
    this(source, input, Side.getSide(sideLabel), minGram, maxGram);
  }

  /**
   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
   *
   * @param factory {@link AttributeFactory} to use
   * @param input {@link Reader} holding the input to be tokenized
   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public EdgeNGramTokenizer(AttributeFactory factory, Reader input, String sideLabel, int minGram, int maxGram) {
    this(factory, input, Side.getSide(sideLabel), minGram, maxGram);
  }

  private void init(Side side, int minGram, int maxGram) {
    if (side == null) {
      throw new IllegalArgumentException("sideLabel must be either front or back");
    }
@@ -107,17 +177,7 @@ public class EdgeNGramTokenizer extends Tokenizer {

    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }
  /**
   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
   *
   * @param input {@link Reader} holding the input to be tokenized
   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
    this(input, Side.getSide(sideLabel), minGram, maxGram);

  }

  /** Returns the next token in the stream, or null at EOS. */
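As a concrete illustration of the new overloads added above, a hedged sketch of an EdgeNGramTokenizer that reuses another stream's attribute instances via the AttributeSource constructor (the input text, gram sizes, and the WhitespaceTokenizer used as the source are assumptions for the example, not part of this commit):

    import java.io.StringReader;

    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;

    public class EdgeNGramSketch {
      public static void main(String[] args) {
        // Any existing TokenStream can serve as the AttributeSource whose
        // attribute instances the new tokenizer will share.
        WhitespaceTokenizer source = new WhitespaceTokenizer(new StringReader("unused"));

        EdgeNGramTokenizer edgeGrams = new EdgeNGramTokenizer(
            source,                         // AttributeSource to reuse (new in LUCENE-1826)
            new StringReader("abcde"),      // input to tokenize
            EdgeNGramTokenizer.Side.FRONT,  // chop n-grams off the front
            1, 3);                          // minGram = 1, maxGram = 3
      }
    }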
NGramTokenizer.java
@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

import java.io.IOException;
import java.io.Reader;
@@ -50,6 +51,42 @@ public class NGramTokenizer extends Tokenizer {
   */
  public NGramTokenizer(Reader input, int minGram, int maxGram) {
    super(input);
    init(minGram, maxGram);
  }

  /**
   * Creates NGramTokenizer with given min and max n-grams.
   * @param source {@link AttributeSource} to use
   * @param input {@link Reader} holding the input to be tokenized
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) {
    super(source, input);
    init(minGram, maxGram);
  }

  /**
   * Creates NGramTokenizer with given min and max n-grams.
   * @param factory {@link AttributeFactory} to use
   * @param input {@link Reader} holding the input to be tokenized
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
    super(factory, input);
    init(minGram, maxGram);
  }

  /**
   * Creates NGramTokenizer with default min and max n-grams.
   * @param input {@link Reader} holding the input to be tokenized
   */
  public NGramTokenizer(Reader input) {
    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
  }

  private void init(int minGram, int maxGram) {
    if (minGram < 1) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
@@ -60,14 +97,7 @@ public class NGramTokenizer extends Tokenizer {
    this.maxGram = maxGram;

    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }
  /**
   * Creates NGramTokenizer with default min and max n-grams.
   * @param input {@link Reader} holding the input to be tokenized
   */
  public NGramTokenizer(Reader input) {
    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }

  /** Returns the next token in the stream, or null at EOS. */
RussianLetterTokenizer.java
@@ -21,6 +21,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.Tokenizer; // for javadocs
import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
import org.apache.lucene.util.AttributeSource;

/**
 * A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
@@ -57,6 +58,18 @@ public class RussianLetterTokenizer extends CharTokenizer
    this(in, RussianCharsets.UnicodeRussian);
  }

  public RussianLetterTokenizer(AttributeSource source, Reader in, char[] charset)
  {
    super(source, in);
    this.charset = charset;
  }

  public RussianLetterTokenizer(AttributeFactory factory, Reader in, char[] charset)
  {
    super(factory, in);
    this.charset = charset;
  }

  /**
   * Collects only characters which satisfy
   * {@link Character#isLetter(char)}.
SentenceTokenizer.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Tokenizes input text into sentences.
@@ -48,11 +49,25 @@ public final class SentenceTokenizer extends Tokenizer {

  public SentenceTokenizer(Reader reader) {
    super(reader);
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
    init();
  }

  public SentenceTokenizer(AttributeSource source, Reader reader) {
    super(source, reader);
    init();
  }

  public SentenceTokenizer(AttributeFactory factory, Reader reader) {
    super(factory, reader);
    init();
  }

  private void init() {
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    clearAttributes();
    buffer.setLength(0);
WikipediaTokenizer.java
@@ -151,14 +151,46 @@ public class WikipediaTokenizer extends Tokenizer {
   */
  public WikipediaTokenizer(Reader input, int tokenOutput, Set untokenizedTypes) {
    super(input);
    this.tokenOutput = tokenOutput;
    this.scanner = new WikipediaTokenizerImpl(input);
    init(tokenOutput, untokenizedTypes);
  }

  /**
   * Creates a new instance of the {@link org.apache.lucene.wikipedia.analysis.WikipediaTokenizer}. Attaches the
   * <code>input</code> to the newly created JFlex scanner. Uses the given {@link AttributeFactory}.
   *
   * @param input The input
   * @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
   * @param untokenizedTypes
   */
  public WikipediaTokenizer(AttributeFactory factory, Reader input, int tokenOutput, Set untokenizedTypes) {
    super(factory, input);
    this.scanner = new WikipediaTokenizerImpl(input);
    init(tokenOutput, untokenizedTypes);
  }

  /**
   * Creates a new instance of the {@link org.apache.lucene.wikipedia.analysis.WikipediaTokenizer}. Attaches the
   * <code>input</code> to the newly created JFlex scanner. Uses the given {@link AttributeSource}.
   *
   * @param input The input
   * @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
   * @param untokenizedTypes
   */
  public WikipediaTokenizer(AttributeSource source, Reader input, int tokenOutput, Set untokenizedTypes) {
    super(source, input);
    this.scanner = new WikipediaTokenizerImpl(input);
    init(tokenOutput, untokenizedTypes);
  }

  private void init(int tokenOutput, Set untokenizedTypes) {
    this.tokenOutput = tokenOutput;
    this.untokenizedTypes = untokenizedTypes;
    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
    this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
  }

  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
CharTokenizer.java
@@ -22,6 +22,8 @@ import java.io.Reader;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

/** An abstract base class for simple, character-oriented tokenizers.*/
public abstract class CharTokenizer extends Tokenizer {
@@ -31,6 +33,18 @@ public abstract class CharTokenizer extends Tokenizer {
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  public CharTokenizer(AttributeSource source, Reader input) {
    super(source, input);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  public CharTokenizer(AttributeFactory factory, Reader input) {
    super(factory, input);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  private int offset = 0, bufferIndex = 0, dataLen = 0;
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 4096;
KeywordTokenizer.java
@@ -22,6 +22,7 @@ import java.io.Reader;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Emits the entire input as a single token.
@@ -41,10 +42,24 @@ public class KeywordTokenizer extends Tokenizer {

  public KeywordTokenizer(Reader input, int bufferSize) {
    super(input);
    init(bufferSize);
  }

  public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
    super(source, input);
    init(bufferSize);
  }

  public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
    super(factory, input);
    init(bufferSize);
  }

  private void init(int bufferSize) {
    this.done = false;
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    termAtt.resizeTermBuffer(bufferSize);
  }

  public final boolean incrementToken() throws IOException {
LetterTokenizer.java
@@ -19,6 +19,9 @@ package org.apache.lucene.analysis;

import java.io.Reader;

import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's
  to say, it defines tokens as maximal strings of adjacent letters, as defined
  by java.lang.Character.isLetter() predicate.
@@ -31,6 +34,16 @@ public class LetterTokenizer extends CharTokenizer {
  public LetterTokenizer(Reader in) {
    super(in);
  }

  /** Construct a new LetterTokenizer using a given {@link AttributeSource}. */
  public LetterTokenizer(AttributeSource source, Reader in) {
    super(source, in);
  }

  /** Construct a new LetterTokenizer using a given {@link AttributeFactory}. */
  public LetterTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
  }

  /** Collects only characters which satisfy
   * {@link Character#isLetter(char)}.*/
LowerCaseTokenizer.java
@@ -19,6 +19,9 @@ package org.apache.lucene.analysis;

import java.io.Reader;

import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

/**
 * LowerCaseTokenizer performs the function of LetterTokenizer
 * and LowerCaseFilter together. It divides text at non-letters and converts
@@ -35,6 +38,16 @@ public final class LowerCaseTokenizer extends LetterTokenizer {
    super(in);
  }

  /** Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. */
  public LowerCaseTokenizer(AttributeSource source, Reader in) {
    super(source, in);
  }

  /** Construct a new LowerCaseTokenizer using a given {@link AttributeFactory}. */
  public LowerCaseTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
  }

  /** Converts char to lower case
   * {@link Character#toLowerCase(char)}.*/
  protected char normalize(char c) {
Tokenizer.java
@@ -77,6 +77,18 @@ public abstract class Tokenizer extends TokenStream {
    super(source);
  }

  /** Construct a token stream processing the given input using the given AttributeSource. */
  protected Tokenizer(AttributeSource source, Reader input) {
    super(source);
    this.input = CharReader.get(input);
  }

  /** Construct a token stream processing the given input using the given AttributeSource. */
  protected Tokenizer(AttributeSource source, CharStream input) {
    super(source);
    this.input = input;
  }

  /** By default, closes the input Reader. */
  public void close() throws IOException {
    input.close();
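For tokenizer authors, the new protected constructors on the Tokenizer base class mean a subclass can forward either an AttributeSource or an AttributeFactory to super. A minimal hypothetical subclass sketch (the class name and its single-token behaviour are invented purely for illustration):

    import java.io.IOException;
    import java.io.Reader;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.AttributeSource;

    public final class SingleTokenTokenizer extends Tokenizer {
      private final TermAttribute termAtt;
      private boolean done = false;

      // Reuses the attribute instances of an existing source, as enabled by LUCENE-1826.
      public SingleTokenTokenizer(AttributeSource source, Reader in) {
        super(source, in);
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (done) {
          return false;
        }
        done = true;
        clearAttributes();
        termAtt.setTermBuffer("token");  // emit one fixed token, purely illustrative
        return true;
      }
    }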
WhitespaceTokenizer.java
@@ -19,15 +19,28 @@ package org.apache.lucene.analysis;

import java.io.Reader;

import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
 * Adjacent sequences of non-Whitespace characters form tokens. */

public class WhitespaceTokenizer extends CharTokenizer {
  /** Construct a new WhitespaceTokenizer. */
  public WhitespaceTokenizer(Reader in) {
    super(in);
  }

  /** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */
  public WhitespaceTokenizer(AttributeSource source, Reader in) {
    super(source, in);
  }

  /** Construct a new WhitespaceTokenizer using a given {@link AttributeFactory}. */
  public WhitespaceTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
  }

  /** Collects only characters which do not satisfy
   * {@link Character#isWhitespace(char)}.*/
  protected boolean isTokenChar(char c) {
StandardTokenizer.java
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

/** A grammar-based tokenizer constructed with JFlex
 *
@@ -126,15 +127,38 @@ public class StandardTokenizer extends Tokenizer {
   * See http://issues.apache.org/jira/browse/LUCENE-1068
   */
  public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
    this.replaceInvalidAcronym = replaceInvalidAcronym;
    setInput(input);
    super();
    this.scanner = new StandardTokenizerImpl(input);
    init(input, replaceInvalidAcronym);
  }

  /**
   * Creates a new StandardTokenizer with a given {@link AttributeSource}.
   */
  public StandardTokenizer(AttributeSource source, Reader input, boolean replaceInvalidAcronym) {
    super(source);
    this.scanner = new StandardTokenizerImpl(input);
    init(input, replaceInvalidAcronym);
  }

  /**
   * Creates a new StandardTokenizer with a given {@link AttributeFactory}.
   */
  public StandardTokenizer(AttributeFactory factory, Reader input, boolean replaceInvalidAcronym) {
    super(factory);
    this.scanner = new StandardTokenizerImpl(input);
    init(input, replaceInvalidAcronym);
  }

  private void init(Reader input, boolean replaceInvalidAcronym) {
    this.replaceInvalidAcronym = replaceInvalidAcronym;
    setInput(input);
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
  }

  // this tokenizer generates three attributes:
  // offset, positionIncrement and type
  private TermAttribute termAtt;
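Sharing one set of attribute instances between streams is the use case these constructors target; a minimal sketch assuming the 2.9-era attribute API (the input strings and the pairing of the two tokenizers are illustrative only):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class SharedSourceSketch {
      public static void main(String[] args) throws IOException {
        // replaceInvalidAcronym = true, i.e. the post-LUCENE-1068 behaviour mentioned above.
        StandardTokenizer first =
            new StandardTokenizer(new StringReader("The Quick Brown Fox"), true);

        // New in LUCENE-1826: a second tokenizer sharing the first one's attribute
        // instances (every TokenStream is itself an AttributeSource).
        StandardTokenizer second =
            new StandardTokenizer(first, new StringReader("jumps over the lazy dog"), true);

        TermAttribute term = (TermAttribute) first.addAttribute(TermAttribute.class);
        while (first.incrementToken()) {
          System.out.println(term.term());  // prints each token of the first stream
        }
        first.close();
        second.close();
      }
    }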