mirror of https://github.com/apache/lucene.git
- 2-char indentation
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@513866 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1c3ec1d1d2
commit
6636d88def
|
@ -28,63 +28,63 @@ import java.io.Reader;
|
|||
* @author Otis Gospodnetic
|
||||
*/
|
||||
public class NGramTokenizer extends Tokenizer {
|
||||
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
|
||||
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
|
||||
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
|
||||
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
|
||||
|
||||
private int minGram, maxGram;
|
||||
private int gramSize;
|
||||
private int pos = 0;
|
||||
private int inLen;
|
||||
private String inStr;
|
||||
private boolean started = false;
|
||||
private int minGram, maxGram;
|
||||
private int gramSize;
|
||||
private int pos = 0;
|
||||
private int inLen;
|
||||
private String inStr;
|
||||
private boolean started = false;
|
||||
|
||||
/**
|
||||
* Creates NGramTokenizer with given min and max n-grams.
|
||||
* @param input Reader holding the input to be tokenized
|
||||
* @param minGram the smallest n-gram to generate
|
||||
* @param maxGram the largest n-gram to generate
|
||||
*/
|
||||
public NGramTokenizer(Reader input, int minGram, int maxGram) {
|
||||
super(input);
|
||||
if (minGram < 1) {
|
||||
throw new IllegalArgumentException("minGram must be greater than zero");
|
||||
}
|
||||
if (minGram > maxGram) {
|
||||
throw new IllegalArgumentException("minGram must not be greater than maxGram");
|
||||
}
|
||||
this.minGram = minGram;
|
||||
this.maxGram = maxGram;
|
||||
/**
|
||||
* Creates NGramTokenizer with given min and max n-grams.
|
||||
* @param input Reader holding the input to be tokenized
|
||||
* @param minGram the smallest n-gram to generate
|
||||
* @param maxGram the largest n-gram to generate
|
||||
*/
|
||||
public NGramTokenizer(Reader input, int minGram, int maxGram) {
|
||||
super(input);
|
||||
if (minGram < 1) {
|
||||
throw new IllegalArgumentException("minGram must be greater than zero");
|
||||
}
|
||||
/**
|
||||
* Creates NGramTokenizer with default min and max n-grams.
|
||||
* @param input Reader holding the input to be tokenized
|
||||
*/
|
||||
public NGramTokenizer(Reader input) {
|
||||
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
||||
if (minGram > maxGram) {
|
||||
throw new IllegalArgumentException("minGram must not be greater than maxGram");
|
||||
}
|
||||
this.minGram = minGram;
|
||||
this.maxGram = maxGram;
|
||||
}
|
||||
/**
|
||||
* Creates NGramTokenizer with default min and max n-grams.
|
||||
* @param input Reader holding the input to be tokenized
|
||||
*/
|
||||
public NGramTokenizer(Reader input) {
|
||||
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
||||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
public final Token next() throws IOException {
|
||||
if (!started) {
|
||||
started = true;
|
||||
gramSize = minGram;
|
||||
char[] chars = new char[1024];
|
||||
input.read(chars);
|
||||
inStr = new String(chars).trim(); // remove any trailing empty strings
|
||||
inLen = inStr.length();
|
||||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
public final Token next() throws IOException {
|
||||
if (!started) {
|
||||
started = true;
|
||||
gramSize = minGram;
|
||||
char[] chars = new char[1024];
|
||||
input.read(chars);
|
||||
inStr = new String(chars).trim(); // remove any trailing empty strings
|
||||
inLen = inStr.length();
|
||||
}
|
||||
|
||||
if (pos+gramSize > inLen) { // if we hit the end of the string
|
||||
pos = 0; // reset to beginning of string
|
||||
gramSize++; // increase n-gram size
|
||||
if (gramSize > maxGram) // we are done
|
||||
return null;
|
||||
if (pos+gramSize > inLen)
|
||||
return null;
|
||||
}
|
||||
String gram = inStr.substring(pos, pos+gramSize);
|
||||
int oldPos = pos;
|
||||
pos++;
|
||||
return new Token(gram, oldPos, oldPos+gramSize);
|
||||
if (pos+gramSize > inLen) { // if we hit the end of the string
|
||||
pos = 0; // reset to beginning of string
|
||||
gramSize++; // increase n-gram size
|
||||
if (gramSize > maxGram) // we are done
|
||||
return null;
|
||||
if (pos+gramSize > inLen)
|
||||
return null;
|
||||
}
|
||||
String gram = inStr.substring(pos, pos+gramSize);
|
||||
int oldPos = pos;
|
||||
pos++;
|
||||
return new Token(gram, oldPos, oldPos+gramSize);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue