mirror of https://github.com/apache/lucene.git
- 2-char indentation
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@513866 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 1c3ec1d1d2
commit 6636d88def
@@ -28,63 +28,63 @@ import java.io.Reader;
 * @author Otis Gospodnetic
 */
public class NGramTokenizer extends Tokenizer {
  public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
  public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

  private int minGram, maxGram;
  private int gramSize;
  private int pos = 0;
  private int inLen;
  private String inStr;
  private boolean started = false;

  /**
   * Creates NGramTokenizer with given min and max n-grams.
   * @param input Reader holding the input to be tokenized
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public NGramTokenizer(Reader input, int minGram, int maxGram) {
    super(input);
    if (minGram < 1) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram) {
      throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
  }
  /**
   * Creates NGramTokenizer with default min and max n-grams.
   * @param input Reader holding the input to be tokenized
   */
  public NGramTokenizer(Reader input) {
    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
  }

  /** Returns the next token in the stream, or null at EOS. */
  public final Token next() throws IOException {
    if (!started) {
      started = true;
      gramSize = minGram;
      char[] chars = new char[1024];
      input.read(chars);
      inStr = new String(chars).trim();  // remove any trailing empty strings
      inLen = inStr.length();
    }

    if (pos+gramSize > inLen) {   // if we hit the end of the string
      pos = 0;                    // reset to beginning of string
      gramSize++;                 // increase n-gram size
      if (gramSize > maxGram)     // we are done
        return null;
      if (pos+gramSize > inLen)
        return null;
    }
    String gram = inStr.substring(pos, pos+gramSize);
    int oldPos = pos;
    pos++;
    return new Token(gram, oldPos, oldPos+gramSize);
  }
}
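For context, a minimal usage sketch of the tokenizer above, written against the contrib analyzers API of that era (org.apache.lucene.analysis.ngram package, pre-reuse Token/next() contract); the class name NGramTokenizerDemo, the sample input "abcde", and the printing loop are illustrative assumptions, not part of this commit:

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.ngram.NGramTokenizer;

public class NGramTokenizerDemo {
  public static void main(String[] args) throws Exception {
    // "abcde" with minGram=1, maxGram=2: next() emits all 1-grams first
    // (a, b, c, d, e) and only then the 2-grams (ab, bc, cd, de).
    NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 1, 2);
    for (Token token = tokenizer.next(); token != null; token = tokenizer.next()) {
      System.out.println(token.termText() + " [" + token.startOffset() + "-" + token.endOffset() + "]");
    }
  }
}

The printed offsets come straight from oldPos and oldPos+gramSize in next() above, so the token "ab" would be reported as spanning [0-2].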