mirror of https://github.com/apache/lucene.git
LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters
This commit is contained in:
parent
59087d148a
commit
2c1ab31b4e
|
@ -203,6 +203,9 @@ New Features
|
||||||
now use to also take pending deletes into account which ensures that all file
|
now use to also take pending deletes into account which ensures that all file
|
||||||
generations per segment always go forward. (Simon Willnauer)
|
generations per segment always go forward. (Simon Willnauer)
|
||||||
|
|
||||||
|
* LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters.
|
||||||
|
(Ingomar Wesp, Shawn Heisey via Robert Muir)
|
||||||
|
|
||||||
* LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked
|
* LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked
|
||||||
as such once it's introduced and can't be changed after the fact.
|
as such once it's introduced and can't be changed after the fact.
|
||||||
(Nhat Nguyen via Simon Willnauer)
|
(Nhat Nguyen via Simon Willnauer)
|
||||||
|
|
|
@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
* <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
|
* <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
|
||||||
* <analyzer>
|
* <analyzer>
|
||||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
* <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/>
|
* <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/>
|
||||||
* </analyzer>
|
* </analyzer>
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*/
|
*/
|
||||||
public class EdgeNGramFilterFactory extends TokenFilterFactory {
|
public class EdgeNGramFilterFactory extends TokenFilterFactory {
|
||||||
private final int maxGramSize;
|
private final int maxGramSize;
|
||||||
private final int minGramSize;
|
private final int minGramSize;
|
||||||
|
private final boolean preserveOriginal;
|
||||||
|
|
||||||
/** Creates a new EdgeNGramFilterFactory */
|
/** Creates a new EdgeNGramFilterFactory */
|
||||||
public EdgeNGramFilterFactory(Map<String, String> args) {
|
public EdgeNGramFilterFactory(Map<String, String> args) {
|
||||||
super(args);
|
super(args);
|
||||||
minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
|
minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
|
||||||
maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
|
maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
|
||||||
|
preserveOriginal = getBoolean(args, "preserveOriginal", EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -49,6 +51,6 @@ public class EdgeNGramFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenFilter create(TokenStream input) {
|
public TokenFilter create(TokenStream input) {
|
||||||
return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize);
|
return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -32,29 +32,46 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
* supplementary characters.
|
* supplementary characters.
|
||||||
*/
|
*/
|
||||||
public final class EdgeNGramTokenFilter extends TokenFilter {
|
public final class EdgeNGramTokenFilter extends TokenFilter {
|
||||||
|
/**
|
||||||
|
* @deprecated since 7.4 - this value will be required.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
|
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
|
||||||
|
/**
|
||||||
|
* @deprecated since 7.4 - this value will be required.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
|
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
|
||||||
|
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
|
||||||
|
|
||||||
private final int minGram;
|
private final int minGram;
|
||||||
private final int maxGram;
|
private final int maxGram;
|
||||||
|
private final boolean preserveOriginal;
|
||||||
|
|
||||||
private char[] curTermBuffer;
|
private char[] curTermBuffer;
|
||||||
private int curTermLength;
|
private int curTermLength;
|
||||||
private int curCodePointCount;
|
private int curTermCodePointCount;
|
||||||
private int curGramSize;
|
private int curGramSize;
|
||||||
private int savePosIncr;
|
private int curPosIncr;
|
||||||
private State state;
|
private State state;
|
||||||
|
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
|
* Creates an EdgeNGramTokenFilter that, for a given input term, produces all
|
||||||
|
* edge n-grams with lengths >= minGram and <= maxGram. Will
|
||||||
|
* optionally preserve the original term when its length is outside of the
|
||||||
|
* defined range.
|
||||||
*
|
*
|
||||||
* @param input {@link TokenStream} holding the input to be tokenized
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
* @param minGram the smallest n-gram to generate
|
* @param minGram the minimum length of the generated n-grams
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the maximum length of the generated n-grams
|
||||||
|
* @param preserveOriginal Whether or not to keep the original term when it
|
||||||
|
* is outside the min/max size range.
|
||||||
*/
|
*/
|
||||||
public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
|
public EdgeNGramTokenFilter(
|
||||||
|
TokenStream input, int minGram, int maxGram, boolean preserveOriginal) {
|
||||||
super(input);
|
super(input);
|
||||||
|
|
||||||
if (minGram < 1) {
|
if (minGram < 1) {
|
||||||
|
@ -67,6 +84,39 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
|
||||||
|
|
||||||
this.minGram = minGram;
|
this.minGram = minGram;
|
||||||
this.maxGram = maxGram;
|
this.maxGram = maxGram;
|
||||||
|
this.preserveOriginal = preserveOriginal;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an EdgeNGramTokenFilter that produces edge n-grams of the given
|
||||||
|
* size.
|
||||||
|
*
|
||||||
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
|
* @param gramSize the n-gram size to generate.
|
||||||
|
*/
|
||||||
|
public EdgeNGramTokenFilter(TokenStream input, int gramSize) {
|
||||||
|
this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an EdgeNGramTokenFilter that, for a given input term, produces all
|
||||||
|
* edge n-grams with lengths >= minGram and <= maxGram.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Behaves the same as
|
||||||
|
* {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)
|
||||||
|
* EdgeNGramTokenFilter(input, minGram, maxGram, false)}
|
||||||
|
*
|
||||||
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
|
* @param minGram the minimum length of the generated n-grams
|
||||||
|
* @param maxGram the maximum length of the generated n-grams
|
||||||
|
*
|
||||||
|
* @deprecated since 7.4. Use
|
||||||
|
* {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)} instead.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
|
||||||
|
this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -75,32 +125,46 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
|
||||||
if (curTermBuffer == null) {
|
if (curTermBuffer == null) {
|
||||||
if (!input.incrementToken()) {
|
if (!input.incrementToken()) {
|
||||||
return false;
|
return false;
|
||||||
} else {
|
|
||||||
curTermBuffer = termAtt.buffer().clone();
|
|
||||||
curTermLength = termAtt.length();
|
|
||||||
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
|
|
||||||
curGramSize = minGram;
|
|
||||||
state = captureState();
|
|
||||||
savePosIncr += posIncrAtt.getPositionIncrement();
|
|
||||||
}
|
}
|
||||||
|
state = captureState();
|
||||||
|
|
||||||
|
curTermLength = termAtt.length();
|
||||||
|
curTermCodePointCount = Character.codePointCount(termAtt, 0, curTermLength);
|
||||||
|
curPosIncr += posIncrAtt.getPositionIncrement();
|
||||||
|
|
||||||
|
if (preserveOriginal && curTermCodePointCount < minGram) {
|
||||||
|
// Token is shorter than minGram, but we'd still like to keep it.
|
||||||
|
posIncrAtt.setPositionIncrement(curPosIncr);
|
||||||
|
curPosIncr = 0;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
curTermBuffer = termAtt.buffer().clone();
|
||||||
|
curGramSize = minGram;
|
||||||
}
|
}
|
||||||
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
|
|
||||||
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
|
if (curGramSize <= curTermCodePointCount) {
|
||||||
// grab gramSize chars from front or back
|
if (curGramSize <= maxGram) { // curGramSize is between minGram and maxGram
|
||||||
restoreState(state);
|
restoreState(state);
|
||||||
// first ngram gets increment, others don't
|
// first ngram gets increment, others don't
|
||||||
if (curGramSize == minGram) {
|
posIncrAtt.setPositionIncrement(curPosIncr);
|
||||||
posIncrAtt.setPositionIncrement(savePosIncr);
|
curPosIncr = 0;
|
||||||
savePosIncr = 0;
|
|
||||||
} else {
|
|
||||||
posIncrAtt.setPositionIncrement(0);
|
|
||||||
}
|
|
||||||
final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
|
final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
|
||||||
termAtt.copyBuffer(curTermBuffer, 0, charLength);
|
termAtt.copyBuffer(curTermBuffer, 0, charLength);
|
||||||
curGramSize++;
|
curGramSize++;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
else if (preserveOriginal) {
|
||||||
|
// Token is longer than maxGram, but we'd still like to keep it.
|
||||||
|
restoreState(state);
|
||||||
|
posIncrAtt.setPositionIncrement(0);
|
||||||
|
termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
|
||||||
|
curTermBuffer = null;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
// Done with this input token, get next token on the next iteration.
|
||||||
curTermBuffer = null;
|
curTermBuffer = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -109,6 +173,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
|
||||||
public void reset() throws IOException {
|
public void reset() throws IOException {
|
||||||
super.reset();
|
super.reset();
|
||||||
curTermBuffer = null;
|
curTermBuffer = null;
|
||||||
savePosIncr = 0;
|
curPosIncr = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
* <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
|
* <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
|
||||||
* <analyzer>
|
* <analyzer>
|
||||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
* <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/>
|
* <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/>
|
||||||
* </analyzer>
|
* </analyzer>
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*/
|
*/
|
||||||
public class NGramFilterFactory extends TokenFilterFactory {
|
public class NGramFilterFactory extends TokenFilterFactory {
|
||||||
private final int maxGramSize;
|
private final int maxGramSize;
|
||||||
private final int minGramSize;
|
private final int minGramSize;
|
||||||
|
private final boolean preserveOriginal;
|
||||||
|
|
||||||
/** Creates a new NGramFilterFactory */
|
/** Creates a new NGramFilterFactory */
|
||||||
public NGramFilterFactory(Map<String, String> args) {
|
public NGramFilterFactory(Map<String, String> args) {
|
||||||
super(args);
|
super(args);
|
||||||
minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
|
minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
|
||||||
maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
|
maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
|
||||||
|
// Read the flag under its documented name "preserveOriginal" (the same key EdgeNGramFilterFactory uses);
// any other key would leave preserveOriginal="true" in args and trip the "Unknown parameters" check below.
preserveOriginal = getBoolean(args, "preserveOriginal", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -49,6 +51,6 @@ public class NGramFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenFilter create(TokenStream input) {
|
public TokenFilter create(TokenStream input) {
|
||||||
return new NGramTokenFilter(input, minGramSize, maxGramSize);
|
return new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,7 +21,6 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
|
||||||
|
@ -40,30 +39,52 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
* override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
|
* override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
|
||||||
*/
|
*/
|
||||||
public final class NGramTokenFilter extends TokenFilter {
|
public final class NGramTokenFilter extends TokenFilter {
|
||||||
|
/**
|
||||||
|
* @deprecated since 7.4 - this value will be required.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
|
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
|
||||||
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
|
|
||||||
|
|
||||||
private final int minGram, maxGram;
|
/**
|
||||||
|
* @deprecated since 7.4 - this value will be required.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
|
||||||
|
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
|
||||||
|
|
||||||
|
private final int minGram;
|
||||||
|
private final int maxGram;
|
||||||
|
private final boolean preserveOriginal;
|
||||||
|
|
||||||
private char[] curTermBuffer;
|
private char[] curTermBuffer;
|
||||||
private int curTermLength;
|
private int curTermLength;
|
||||||
private int curCodePointCount;
|
private int curTermCodePointCount;
|
||||||
private int curGramSize;
|
private int curGramSize;
|
||||||
private int curPos;
|
private int curPos;
|
||||||
private int curPosInc;
|
private int curPosIncr;
|
||||||
private State state;
|
private State state;
|
||||||
|
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final PositionIncrementAttribute posIncAtt;
|
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates NGramTokenFilter with given min and max n-grams.
|
* Creates an NGramTokenFilter that, for a given input term, produces all
|
||||||
|
* contained n-grams with lengths >= minGram and <= maxGram. Will
|
||||||
|
* optionally preserve the original term when its length is outside of the
|
||||||
|
* defined range.
|
||||||
|
*
|
||||||
|
* Note: Care must be taken when choosing minGram and maxGram; depending
|
||||||
|
* on the input token size, this filter potentially produces a huge number
|
||||||
|
* of terms.
|
||||||
|
*
|
||||||
* @param input {@link TokenStream} holding the input to be tokenized
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
* @param minGram the smallest n-gram to generate
|
* @param minGram the minimum length of the generated n-grams
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the maximum length of the generated n-grams
|
||||||
|
* @param preserveOriginal Whether or not to keep the original term when it
|
||||||
|
* is shorter than minGram or longer than maxGram
|
||||||
*/
|
*/
|
||||||
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
|
public NGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean preserveOriginal) {
|
||||||
super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
|
super(input);
|
||||||
if (minGram < 1) {
|
if (minGram < 1) {
|
||||||
throw new IllegalArgumentException("minGram must be greater than zero");
|
throw new IllegalArgumentException("minGram must be greater than zero");
|
||||||
}
|
}
|
||||||
|
@ -72,50 +93,106 @@ public final class NGramTokenFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
this.minGram = minGram;
|
this.minGram = minGram;
|
||||||
this.maxGram = maxGram;
|
this.maxGram = maxGram;
|
||||||
|
this.preserveOriginal = preserveOriginal;
|
||||||
|
}
|
||||||
|
|
||||||
posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
/**
|
||||||
|
* Creates an NGramTokenFilter that produces n-grams of the indicated size.
|
||||||
|
*
|
||||||
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
|
* @param gramSize the size of n-grams to generate.
|
||||||
|
*/
|
||||||
|
public NGramTokenFilter(TokenStream input, int gramSize) {
|
||||||
|
this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an NGramTokenFilter that, for a given input term, produces all
|
||||||
|
* contained n-grams with lengths >= minGram and <= maxGram.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Behaves the same as
|
||||||
|
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)
|
||||||
|
* NGramTokenFilter(input, minGram, maxGram, false)}
|
||||||
|
*
|
||||||
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
|
* @param minGram the minimum length of the generated n-grams
|
||||||
|
* @param maxGram the maximum length of the generated n-grams
|
||||||
|
*
|
||||||
|
* @deprecated since 7.4. Use
|
||||||
|
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
|
||||||
|
this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates NGramTokenFilter with default min and max n-grams.
|
* Creates NGramTokenFilter with default min and max n-grams.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Behaves the same as
|
||||||
|
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)
|
||||||
|
* NGramTokenFilter(input, 1, 2, false)}
|
||||||
|
*
|
||||||
* @param input {@link TokenStream} holding the input to be tokenized
|
* @param input {@link TokenStream} holding the input to be tokenized
|
||||||
|
* @deprecated since 7.4. Use
|
||||||
|
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead.
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public NGramTokenFilter(TokenStream input) {
|
public NGramTokenFilter(TokenStream input) {
|
||||||
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
|
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_PRESERVE_ORIGINAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the next token in the stream, or null at EOS. */
|
|
||||||
@Override
|
@Override
|
||||||
public final boolean incrementToken() throws IOException {
|
public final boolean incrementToken() throws IOException {
|
||||||
while (true) {
|
while (true) {
|
||||||
if (curTermBuffer == null) {
|
if (curTermBuffer == null) {
|
||||||
if (!input.incrementToken()) {
|
if (!input.incrementToken()) {
|
||||||
return false;
|
return false;
|
||||||
} else {
|
|
||||||
curTermBuffer = termAtt.buffer().clone();
|
|
||||||
curTermLength = termAtt.length();
|
|
||||||
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
|
|
||||||
curGramSize = minGram;
|
|
||||||
curPos = 0;
|
|
||||||
curPosInc = posIncAtt.getPositionIncrement();
|
|
||||||
state = captureState();
|
|
||||||
}
|
}
|
||||||
|
state = captureState();
|
||||||
|
|
||||||
|
curTermLength = termAtt.length();
|
||||||
|
curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
|
||||||
|
curPosIncr += posIncrAtt.getPositionIncrement();
|
||||||
|
curPos = 0;
|
||||||
|
|
||||||
|
if (preserveOriginal && curTermCodePointCount < minGram) {
|
||||||
|
// Token is shorter than minGram, but we'd still like to keep it.
|
||||||
|
posIncrAtt.setPositionIncrement(curPosIncr);
|
||||||
|
curPosIncr = 0;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
curTermBuffer = termAtt.buffer().clone();
|
||||||
|
curGramSize = minGram;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
|
if (curGramSize > maxGram || (curPos + curGramSize) > curTermCodePointCount) {
|
||||||
++curPos;
|
++curPos;
|
||||||
curGramSize = minGram;
|
curGramSize = minGram;
|
||||||
}
|
}
|
||||||
if ((curPos + curGramSize) <= curCodePointCount) {
|
if ((curPos + curGramSize) <= curTermCodePointCount) {
|
||||||
restoreState(state);
|
restoreState(state);
|
||||||
final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
|
final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
|
||||||
final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
|
final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
|
||||||
termAtt.copyBuffer(curTermBuffer, start, end - start);
|
termAtt.copyBuffer(curTermBuffer, start, end - start);
|
||||||
posIncAtt.setPositionIncrement(curPosInc);
|
posIncrAtt.setPositionIncrement(curPosIncr);
|
||||||
curPosInc = 0;
|
curPosIncr = 0;
|
||||||
curGramSize++;
|
curGramSize++;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
else if (preserveOriginal && curTermCodePointCount > maxGram) {
|
||||||
|
// Token is longer than maxGram, but we'd still like to keep it.
|
||||||
|
restoreState(state);
|
||||||
|
posIncrAtt.setPositionIncrement(0);
|
||||||
|
termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
|
||||||
|
curTermBuffer = null;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Done with this input token, get next token on next iteration.
|
||||||
curTermBuffer = null;
|
curTermBuffer = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -124,5 +201,6 @@ public final class NGramTokenFilter extends TokenFilter {
|
||||||
public void reset() throws IOException {
|
public void reset() throws IOException {
|
||||||
super.reset();
|
super.reset();
|
||||||
curTermBuffer = null;
|
curTermBuffer = null;
|
||||||
|
curPosIncr = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -236,7 +236,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
|
||||||
//TokenStream stream = new SopTokenFilter(tokenizer);
|
//TokenStream stream = new SopTokenFilter(tokenizer);
|
||||||
TokenStream stream = new ShingleFilter(tokenizer, 5);
|
TokenStream stream = new ShingleFilter(tokenizer, 5);
|
||||||
//stream = new SopTokenFilter(stream);
|
//stream = new SopTokenFilter(stream);
|
||||||
stream = new NGramTokenFilter(stream, 55, 83);
|
stream = new NGramTokenFilter(stream, 55, 83, false);
|
||||||
//stream = new SopTokenFilter(stream);
|
//stream = new SopTokenFilter(stream);
|
||||||
return new TokenStreamComponents(tokenizer, stream);
|
return new TokenStreamComponents(tokenizer, stream);
|
||||||
}
|
}
|
||||||
|
|
|
@ -50,49 +50,73 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testInvalidInput() throws Exception {
|
public void testInvalidInput() throws Exception {
|
||||||
expectThrows(IllegalArgumentException.class, () -> {
|
expectThrows(IllegalArgumentException.class, () -> {
|
||||||
new EdgeNGramTokenFilter(input, 0, 0);
|
new EdgeNGramTokenFilter(input, 0, 0, false);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testInvalidInput2() throws Exception {
|
public void testInvalidInput2() throws Exception {
|
||||||
expectThrows(IllegalArgumentException.class, () -> {
|
expectThrows(IllegalArgumentException.class, () -> {
|
||||||
new EdgeNGramTokenFilter(input, 2, 1);
|
new EdgeNGramTokenFilter(input, 2, 1, false);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testInvalidInput3() throws Exception {
|
public void testInvalidInput3() throws Exception {
|
||||||
expectThrows(IllegalArgumentException.class, () -> {
|
expectThrows(IllegalArgumentException.class, () -> {
|
||||||
new EdgeNGramTokenFilter(input, -1, 2);
|
new EdgeNGramTokenFilter(input, -1, 2, false);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFrontUnigram() throws Exception {
|
public void testFrontUnigram() throws Exception {
|
||||||
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1);
|
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1, false);
|
||||||
assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{5});
|
assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{5});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testOversizedNgrams() throws Exception {
|
public void testOversizedNgrams() throws Exception {
|
||||||
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6);
|
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, false);
|
||||||
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
|
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testOversizedNgramsPreserveOriginal() throws Exception {
|
||||||
|
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, true);
|
||||||
|
assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testPreserveOriginal() throws Exception {
|
||||||
|
final String inputString = "a bcd efghi jk";
|
||||||
|
|
||||||
|
{ // preserveOriginal = false
|
||||||
|
TokenStream ts = whitespaceMockTokenizer(inputString);
|
||||||
|
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false);
|
||||||
|
assertTokenStreamContents(filter,
|
||||||
|
new String[] { "bc", "bcd", "ef", "efg", "jk" },
|
||||||
|
new int[] { 2, 2, 6, 6, 12 },
|
||||||
|
new int[] { 5, 5, 11, 11, 14 },
|
||||||
|
new int[] { 2, 0, 1, 0, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
|
{ // preserveOriginal = true
|
||||||
|
TokenStream ts = whitespaceMockTokenizer(inputString);
|
||||||
|
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true);
|
||||||
|
assertTokenStreamContents(filter,
|
||||||
|
new String[] { "a", "bc", "bcd", "ef", "efg", "efghi", "jk" },
|
||||||
|
new int[] { 0, 2, 2, 6, 6, 6, 12 },
|
||||||
|
new int[] { 1, 5, 5, 11, 11, 11, 14 },
|
||||||
|
new int[] { 1, 1, 0, 1, 0, 0, 1 });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void testFrontRangeOfNgrams() throws Exception {
|
public void testFrontRangeOfNgrams() throws Exception {
|
||||||
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3);
|
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3, false);
|
||||||
assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
|
assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFilterPositions() throws Exception {
|
public void testFilterPositions() throws Exception {
|
||||||
TokenStream ts = whitespaceMockTokenizer("abcde vwxyz");
|
TokenStream ts = whitespaceMockTokenizer("abcde vwxyz");
|
||||||
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3);
|
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3, false);
|
||||||
assertTokenStreamContents(tokenizer,
|
assertTokenStreamContents(tokenizer,
|
||||||
new String[]{"a","ab","abc","v","vw","vwx"},
|
new String[] {"a","ab","abc","v","vw","vwx"},
|
||||||
new int[]{0,0,0,6,6,6},
|
new int[] {0, 0, 0, 6, 6, 6},
|
||||||
new int[]{5,5,5,11,11,11},
|
new int[] {5, 5, 5, 11, 11, 11});
|
||||||
null,
|
|
||||||
new int[]{1,0,0,1,0,0},
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
false);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class PositionFilter extends TokenFilter {
|
private static class PositionFilter extends TokenFilter {
|
||||||
|
@ -128,7 +152,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
public void testFirstTokenPositionIncrement() throws Exception {
|
public void testFirstTokenPositionIncrement() throws Exception {
|
||||||
TokenStream ts = whitespaceMockTokenizer("a abc");
|
TokenStream ts = whitespaceMockTokenizer("a abc");
|
||||||
ts = new PositionFilter(ts); // All but first token will get 0 position increment
|
ts = new PositionFilter(ts); // All but first token will get 0 position increment
|
||||||
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3);
|
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false);
|
||||||
// The first token "a" will not be output, since it's smaller than the mingram size of 2.
|
// The first token "a" will not be output, since it's smaller than the mingram size of 2.
|
||||||
// The second token on input to EdgeNGramTokenFilter will have position increment of 0,
|
// The second token on input to EdgeNGramTokenFilter will have position increment of 0,
|
||||||
// which should be increased to 1, since this is the first output token in the stream.
|
// which should be increased to 1, since this is the first output token in the stream.
|
||||||
|
@ -142,14 +166,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testSmallTokenInStream() throws Exception {
|
public void testSmallTokenInStream() throws Exception {
|
||||||
input = whitespaceMockTokenizer("abc de fgh");
|
input = whitespaceMockTokenizer("abc de fgh");
|
||||||
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3);
|
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3, false);
|
||||||
assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
|
assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReset() throws Exception {
|
public void testReset() throws Exception {
|
||||||
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
|
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
|
||||||
tokenizer.setReader(new StringReader("abcde"));
|
tokenizer.setReader(new StringReader("abcde"));
|
||||||
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3);
|
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false);
|
||||||
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
|
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
|
||||||
tokenizer.setReader(new StringReader("abcde"));
|
tokenizer.setReader(new StringReader("abcde"));
|
||||||
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
|
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
|
||||||
|
@ -160,13 +184,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
for (int i = 0; i < 10; i++) {
|
for (int i = 0; i < 10; i++) {
|
||||||
final int min = TestUtil.nextInt(random(), 2, 10);
|
final int min = TestUtil.nextInt(random(), 2, 10);
|
||||||
final int max = TestUtil.nextInt(random(), min, 20);
|
final int max = TestUtil.nextInt(random(), min, 20);
|
||||||
|
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
|
||||||
|
|
||||||
Analyzer a = new Analyzer() {
|
Analyzer a = new Analyzer() {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName) {
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
return new TokenStreamComponents(tokenizer,
|
return new TokenStreamComponents(tokenizer,
|
||||||
new EdgeNGramTokenFilter(tokenizer, min, max));
|
new EdgeNGramTokenFilter(tokenizer, min, max, preserveOriginal));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
|
checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
|
||||||
|
@ -181,7 +206,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName) {
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
Tokenizer tokenizer = new KeywordTokenizer();
|
Tokenizer tokenizer = new KeywordTokenizer();
|
||||||
return new TokenStreamComponents(tokenizer,
|
return new TokenStreamComponents(tokenizer,
|
||||||
new EdgeNGramTokenFilter(tokenizer, 2, 15));
|
new EdgeNGramTokenFilter(tokenizer, 2, 15, false));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
|
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
|
||||||
|
@ -192,7 +217,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
TokenStream tk = new LetterTokenizer();
|
TokenStream tk = new LetterTokenizer();
|
||||||
((Tokenizer)tk).setReader(new StringReader("abc d efgh ij klmno p q"));
|
((Tokenizer)tk).setReader(new StringReader("abc d efgh ij klmno p q"));
|
||||||
tk = new ShingleFilter(tk);
|
tk = new ShingleFilter(tk);
|
||||||
tk = new EdgeNGramTokenFilter(tk, 7, 10);
|
tk = new EdgeNGramTokenFilter(tk, 7, 10, false);
|
||||||
assertTokenStreamContents(tk,
|
assertTokenStreamContents(tk,
|
||||||
new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
|
new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
|
||||||
new int[] { 6,11,11,14 },
|
new int[] { 6,11,11,14 },
|
||||||
|
@ -204,23 +229,44 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSupplementaryCharacters() throws IOException {
|
public void testSupplementaryCharacters() throws IOException {
|
||||||
final String s = TestUtil.randomUnicodeString(random(), 10);
|
for (int i = 0; i < 20; i++) {
|
||||||
final int codePointCount = s.codePointCount(0, s.length());
|
final String s = TestUtil.randomUnicodeString(random(), 10);
|
||||||
final int minGram = TestUtil.nextInt(random(), 1, 3);
|
final int codePointCount = s.codePointCount(0, s.length());
|
||||||
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
|
final int minGram = TestUtil.nextInt(random(), 1, 3);
|
||||||
TokenStream tk = new KeywordTokenizer();
|
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
|
||||||
((Tokenizer)tk).setReader(new StringReader(s));
|
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
|
||||||
tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
|
|
||||||
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
|
TokenStream tk = new KeywordTokenizer();
|
||||||
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
|
((Tokenizer)tk).setReader(new StringReader(s));
|
||||||
tk.reset();
|
tk = new EdgeNGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
|
||||||
for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
|
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
|
||||||
assertTrue(tk.incrementToken());
|
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
|
||||||
assertEquals(0, offsetAtt.startOffset());
|
tk.reset();
|
||||||
assertEquals(s.length(), offsetAtt.endOffset());
|
|
||||||
final int end = Character.offsetByCodePoints(s, 0, i);
|
if (codePointCount < minGram && preserveOriginal) {
|
||||||
assertEquals(s.substring(0, end), termAtt.toString());
|
assertTrue(tk.incrementToken());
|
||||||
|
assertEquals(0, offsetAtt.startOffset());
|
||||||
|
assertEquals(s.length(), offsetAtt.endOffset());
|
||||||
|
assertEquals(s, termAtt.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j = minGram; j <= Math.min(codePointCount, maxGram); j++) {
|
||||||
|
assertTrue(tk.incrementToken());
|
||||||
|
assertEquals(0, offsetAtt.startOffset());
|
||||||
|
assertEquals(s.length(), offsetAtt.endOffset());
|
||||||
|
final int end = Character.offsetByCodePoints(s, 0, j);
|
||||||
|
assertEquals(s.substring(0, end), termAtt.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (codePointCount > maxGram && preserveOriginal) {
|
||||||
|
assertTrue(tk.incrementToken());
|
||||||
|
assertEquals(0, offsetAtt.startOffset());
|
||||||
|
assertEquals(s.length(), offsetAtt.endOffset());
|
||||||
|
assertEquals(s, termAtt.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
assertFalse(tk.incrementToken());
|
||||||
|
tk.close();
|
||||||
}
|
}
|
||||||
assertFalse(tk.incrementToken());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -48,28 +48,28 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testInvalidInput() throws Exception {
|
public void testInvalidInput() throws Exception {
|
||||||
expectThrows(IllegalArgumentException.class, () -> {
|
expectThrows(IllegalArgumentException.class, () -> {
|
||||||
new NGramTokenFilter(input, 2, 1);
|
new NGramTokenFilter(input, 2, 1, false);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testInvalidInput2() throws Exception {
|
public void testInvalidInput2() throws Exception {
|
||||||
expectThrows(IllegalArgumentException.class, () -> {
|
expectThrows(IllegalArgumentException.class, () -> {
|
||||||
new NGramTokenFilter(input, 0, 1);
|
new NGramTokenFilter(input, 0, 1, false);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testUnigrams() throws Exception {
|
public void testUnigrams() throws Exception {
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
|
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1, false);
|
||||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
|
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testBigrams() throws Exception {
|
public void testBigrams() throws Exception {
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
|
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2, false);
|
||||||
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0});
|
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testNgrams() throws Exception {
|
public void testNgrams() throws Exception {
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
|
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false);
|
||||||
assertTokenStreamContents(filter,
|
assertTokenStreamContents(filter,
|
||||||
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
|
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
|
||||||
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
|
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
|
||||||
|
@ -81,7 +81,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testNgramsNoIncrement() throws Exception {
|
public void testNgramsNoIncrement() throws Exception {
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
|
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false);
|
||||||
assertTokenStreamContents(filter,
|
assertTokenStreamContents(filter,
|
||||||
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
|
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
|
||||||
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
|
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
|
||||||
|
@ -93,25 +93,61 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testOversizedNgrams() throws Exception {
|
public void testOversizedNgrams() throws Exception {
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
|
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7, false);
|
||||||
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
|
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testOversizedNgramsPreserveOriginal() throws Exception {
|
||||||
|
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 6, 6, true);
|
||||||
|
assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
|
||||||
|
}
|
||||||
|
|
||||||
public void testSmallTokenInStream() throws Exception {
|
public void testSmallTokenInStream() throws Exception {
|
||||||
input = whitespaceMockTokenizer("abc de fgh");
|
input = whitespaceMockTokenizer("abc de fgh");
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
|
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, false);
|
||||||
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
|
assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSmallTokenInStreamPreserveOriginal() throws Exception {
|
||||||
|
input = whitespaceMockTokenizer("abc de fgh");
|
||||||
|
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, true);
|
||||||
|
assertTokenStreamContents(tokenizer, new String[]{"abc","de","fgh"}, new int[]{0,4,7}, new int[]{3,6,10}, new int[] {1, 1, 1});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReset() throws Exception {
|
public void testReset() throws Exception {
|
||||||
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
|
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
|
||||||
tokenizer.setReader(new StringReader("abcde"));
|
tokenizer.setReader(new StringReader("abcde"));
|
||||||
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
|
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false);
|
||||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
|
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
|
||||||
tokenizer.setReader(new StringReader("abcde"));
|
tokenizer.setReader(new StringReader("abcde"));
|
||||||
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
|
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testKeepShortTermKeepLongTerm() throws Exception {
|
||||||
|
final String inputString = "a bcd efghi jk";
|
||||||
|
|
||||||
|
{ // preserveOriginal = false
|
||||||
|
TokenStream ts = whitespaceMockTokenizer(inputString);
|
||||||
|
NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, false);
|
||||||
|
assertTokenStreamContents(filter,
|
||||||
|
new String[] { "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" },
|
||||||
|
new int[] { 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 12 },
|
||||||
|
new int[] { 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 14 },
|
||||||
|
new int[] { 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
|
{ // preserveOriginal = true
|
||||||
|
TokenStream ts = whitespaceMockTokenizer(inputString);
|
||||||
|
NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true);
|
||||||
|
assertTokenStreamContents(filter,
|
||||||
|
new String[] { "a", "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" },
|
||||||
|
new int[] { 0, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 12 },
|
||||||
|
new int[] { 1, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 14 },
|
||||||
|
new int[] { 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// LUCENE-3642
|
// LUCENE-3642
|
||||||
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
|
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
|
||||||
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
|
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
|
||||||
|
@ -122,7 +158,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName) {
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
|
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
|
||||||
filters = new NGramTokenFilter(filters, 2, 2);
|
filters = new NGramTokenFilter(filters, 2, 2, false);
|
||||||
return new TokenStreamComponents(tokenizer, filters);
|
return new TokenStreamComponents(tokenizer, filters);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -139,12 +175,14 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
for (int i = 0; i < 10; i++) {
|
for (int i = 0; i < 10; i++) {
|
||||||
final int min = TestUtil.nextInt(random(), 2, 10);
|
final int min = TestUtil.nextInt(random(), 2, 10);
|
||||||
final int max = TestUtil.nextInt(random(), min, 20);
|
final int max = TestUtil.nextInt(random(), min, 20);
|
||||||
|
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
|
||||||
|
|
||||||
Analyzer a = new Analyzer() {
|
Analyzer a = new Analyzer() {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName) {
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
return new TokenStreamComponents(tokenizer,
|
return new TokenStreamComponents(tokenizer,
|
||||||
new NGramTokenFilter(tokenizer, min, max));
|
new NGramTokenFilter(tokenizer, min, max, preserveOriginal));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
|
checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
|
||||||
|
@ -159,7 +197,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName) {
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
Tokenizer tokenizer = new KeywordTokenizer();
|
Tokenizer tokenizer = new KeywordTokenizer();
|
||||||
return new TokenStreamComponents(tokenizer,
|
return new TokenStreamComponents(tokenizer,
|
||||||
new NGramTokenFilter(tokenizer, 2, 15));
|
new NGramTokenFilter(tokenizer, 2, 15, false));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
|
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
|
||||||
|
@ -167,27 +205,47 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSupplementaryCharacters() throws IOException {
|
public void testSupplementaryCharacters() throws IOException {
|
||||||
final String s = TestUtil.randomUnicodeString(random(), 10);
|
for (int i = 0; i < 20; i++) {
|
||||||
final int codePointCount = s.codePointCount(0, s.length());
|
final String s = TestUtil.randomUnicodeString(random(), 10);
|
||||||
final int minGram = TestUtil.nextInt(random(), 1, 3);
|
final int codePointCount = s.codePointCount(0, s.length());
|
||||||
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
|
final int minGram = TestUtil.nextInt(random(), 1, 3);
|
||||||
TokenStream tk = new KeywordTokenizer();
|
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
|
||||||
((Tokenizer)tk).setReader(new StringReader(s));
|
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
|
||||||
tk = new NGramTokenFilter(tk, minGram, maxGram);
|
|
||||||
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
|
TokenStream tk = new KeywordTokenizer();
|
||||||
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
|
((Tokenizer)tk).setReader(new StringReader(s));
|
||||||
tk.reset();
|
tk = new NGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
|
||||||
for (int start = 0; start < codePointCount; ++start) {
|
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
|
||||||
for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
|
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
|
||||||
|
tk.reset();
|
||||||
|
|
||||||
|
if (codePointCount < minGram && preserveOriginal) {
|
||||||
assertTrue(tk.incrementToken());
|
assertTrue(tk.incrementToken());
|
||||||
assertEquals(0, offsetAtt.startOffset());
|
assertEquals(0, offsetAtt.startOffset());
|
||||||
assertEquals(s.length(), offsetAtt.endOffset());
|
assertEquals(s.length(), offsetAtt.endOffset());
|
||||||
final int startIndex = Character.offsetByCodePoints(s, 0, start);
|
assertEquals(s, termAtt.toString());
|
||||||
final int endIndex = Character.offsetByCodePoints(s, 0, end);
|
|
||||||
assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
assertFalse(tk.incrementToken());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
for (int start = 0; start < codePointCount; ++start) {
|
||||||
|
for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
|
||||||
|
assertTrue(tk.incrementToken());
|
||||||
|
assertEquals(0, offsetAtt.startOffset());
|
||||||
|
assertEquals(s.length(), offsetAtt.endOffset());
|
||||||
|
final int startIndex = Character.offsetByCodePoints(s, 0, start);
|
||||||
|
final int endIndex = Character.offsetByCodePoints(s, 0, end);
|
||||||
|
assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (codePointCount > maxGram && preserveOriginal) {
|
||||||
|
assertTrue(tk.incrementToken());
|
||||||
|
assertEquals(0, offsetAtt.startOffset());
|
||||||
|
assertEquals(s.length(), offsetAtt.endOffset());
|
||||||
|
assertEquals(s, termAtt.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
assertFalse(tk.incrementToken());
|
||||||
|
tk.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -56,12 +56,14 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the NGramFilterFactory
|
* Test the NGramFilterFactory with old defaults
|
||||||
*/
|
*/
|
||||||
public void testNGramFilter() throws Exception {
|
public void testNGramFilter() throws Exception {
|
||||||
Reader reader = new StringReader("test");
|
Reader reader = new StringReader("test");
|
||||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||||
stream = tokenFilterFactory("NGram").create(stream);
|
stream = tokenFilterFactory("NGram",
|
||||||
|
"minGramSize", "1",
|
||||||
|
"maxGramSize", "2").create(stream);
|
||||||
assertTokenStreamContents(stream,
|
assertTokenStreamContents(stream,
|
||||||
new String[] { "t", "te", "e", "es", "s", "st", "t" });
|
new String[] { "t", "te", "e", "es", "s", "st", "t" });
|
||||||
}
|
}
|
||||||
|
@ -126,12 +128,13 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test EdgeNGramFilterFactory
|
* Test EdgeNGramFilterFactory with old defaults
|
||||||
*/
|
*/
|
||||||
public void testEdgeNGramFilter() throws Exception {
|
public void testEdgeNGramFilter() throws Exception {
|
||||||
Reader reader = new StringReader("test");
|
Reader reader = new StringReader("test");
|
||||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||||
stream = tokenFilterFactory("EdgeNGram").create(stream);
|
stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1",
|
||||||
|
"maxGramSize", "1").create(stream);
|
||||||
assertTokenStreamContents(stream,
|
assertTokenStreamContents(stream,
|
||||||
new String[] { "t" });
|
new String[] { "t" });
|
||||||
}
|
}
|
||||||
|
@ -173,7 +176,8 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
|
||||||
|
|
||||||
/** Test that bogus arguments result in exception */
|
/** Test that bogus arguments result in exception */
|
||||||
public void testBogusArguments() throws Exception {
|
public void testBogusArguments() throws Exception {
|
||||||
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
IllegalArgumentException expected = null;
|
||||||
|
expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||||
tokenizerFactory("NGram", "bogusArg", "bogusValue");
|
tokenizerFactory("NGram", "bogusArg", "bogusValue");
|
||||||
});
|
});
|
||||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||||
|
@ -184,12 +188,12 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
|
||||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||||
|
|
||||||
expected = expectThrows(IllegalArgumentException.class, () -> {
|
expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||||
tokenFilterFactory("NGram", "bogusArg", "bogusValue");
|
tokenFilterFactory("NGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue");
|
||||||
});
|
});
|
||||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||||
|
|
||||||
expected = expectThrows(IllegalArgumentException.class, () -> {
|
expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||||
tokenFilterFactory("EdgeNGram", "bogusArg", "bogusValue");
|
tokenFilterFactory("EdgeNGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue");
|
||||||
});
|
});
|
||||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||||
}
|
}
|
||||||
|
|
|
@ -87,7 +87,7 @@ public class BM25NBClassifierTest extends ClassificationTestBase<BytesRef> {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName) {
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
final Tokenizer tokenizer = new KeywordTokenizer();
|
final Tokenizer tokenizer = new KeywordTokenizer();
|
||||||
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
|
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -86,7 +86,7 @@ public class CachingNaiveBayesClassifierTest extends ClassificationTestBase<Byte
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName) {
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
final Tokenizer tokenizer = new KeywordTokenizer();
|
final Tokenizer tokenizer = new KeywordTokenizer();
|
||||||
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
|
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -89,7 +89,7 @@ public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase<Bytes
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName) {
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
final Tokenizer tokenizer = new KeywordTokenizer();
|
final Tokenizer tokenizer = new KeywordTokenizer();
|
||||||
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
|
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -359,7 +359,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
||||||
: "no need \"textgrams\" when minPrefixChars="+minPrefixChars;
|
: "no need \"textgrams\" when minPrefixChars="+minPrefixChars;
|
||||||
if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) {
|
if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) {
|
||||||
// TODO: should use an EdgeNGramTokenFilterFactory here
|
// TODO: should use an EdgeNGramTokenFilterFactory here
|
||||||
TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars);
|
TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false);
|
||||||
return new TokenStreamComponents(components.getTokenizer(), filter);
|
return new TokenStreamComponents(components.getTokenizer(), filter);
|
||||||
} else {
|
} else {
|
||||||
return components;
|
return components;
|
||||||
|
|
Loading…
Reference in New Issue