mirror of https://github.com/apache/lucene.git
LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters
commit 2c1ab31b4e (parent 59087d148a)

@@ -202,6 +202,9 @@ New Features
   IndexFileDeleter already accounts for that for existing files which we can
   now use to also take pending deletes into account which ensures that all file
   generations per segment always go forward. (Simon Willnauer)
 
+* LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters.
+  (Ingomar Wesp, Shawn Heisey via Robert Muir)
+
 * LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked
   as such once it's introduced and can't be changed after the fact.

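For orientation before the per-file diffs: a minimal, hypothetical sketch of what the new option does. The 4-argument constructor is the one introduced by this commit; the demo class name and input values are only illustrative. With minGram=2 and maxGram=3, a term longer than maxGram normally yields only its edge grams; preserveOriginal=true additionally emits the untouched term:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PreserveOriginalSketch {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("apache"));
    // preserveOriginal=true: "apache" (6 code points) is outside [2,3],
    // so it is emitted in addition to the edge grams "ap" and "apa".
    TokenStream ts = new EdgeNGramTokenFilter(tok, 2, 3, true);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // ap, apa, apache
    }
    ts.end();
    ts.close();
  }
}
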
@@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
  * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
  * <analyzer>
  * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- * <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/>
+ * <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/>
  * </analyzer>
  * </fieldType></pre>
  */
 public class EdgeNGramFilterFactory extends TokenFilterFactory {
   private final int maxGramSize;
   private final int minGramSize;
+  private final boolean preserveOriginal;
 
   /** Creates a new EdgeNGramFilterFactory */
   public EdgeNGramFilterFactory(Map<String, String> args) {
     super(args);
     minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
     maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
+    preserveOriginal = getBoolean(args, "preserveOriginal", EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }

@@ -49,6 +51,6 @@ public class EdgeNGramFilterFactory extends TokenFilterFactory {
 
   @Override
   public TokenFilter create(TokenStream input) {
-    return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize);
+    return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
   }
 }

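A hedged sketch of driving the factory programmatically: the args map mirrors the <filter/> attributes in the javadoc above, while the wrapper class and method are only illustrative:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory;

public class EdgeNGramFactorySketch {
  static TokenStream wrap(TokenStream input) {
    Map<String, String> args = new HashMap<>();
    args.put("minGramSize", "1");
    args.put("maxGramSize", "2");
    args.put("preserveOriginal", "true");
    // the factory consumes the entries it knows; anything left over
    // triggers the "Unknown parameters" IllegalArgumentException
    EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(args);
    return factory.create(input);
  }
}
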
@@ -32,29 +32,46 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  * supplementary characters.
  */
 public final class EdgeNGramTokenFilter extends TokenFilter {
+  /**
+   * @deprecated since 7.4 - this value will be required.
+   */
+  @Deprecated
   public static final int DEFAULT_MAX_GRAM_SIZE = 1;
+  /**
+   * @deprecated since 7.4 - this value will be required.
+   */
+  @Deprecated
   public static final int DEFAULT_MIN_GRAM_SIZE = 1;
+  public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
 
   private final int minGram;
   private final int maxGram;
+  private final boolean preserveOriginal;
 
   private char[] curTermBuffer;
   private int curTermLength;
-  private int curCodePointCount;
+  private int curTermCodePointCount;
   private int curGramSize;
-  private int savePosIncr;
+  private int curPosIncr;
   private State state;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
   /**
-   * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
-   *
+   * Creates an EdgeNGramTokenFilter that, for a given input term, produces all
+   * edge n-grams with lengths >= minGram and <= maxGram. Will
+   * optionally preserve the original term when its length is outside of the
+   * defined range.
+   *
    * @param input {@link TokenStream} holding the input to be tokenized
-   * @param minGram the smallest n-gram to generate
-   * @param maxGram the largest n-gram to generate
+   * @param minGram the minimum length of the generated n-grams
+   * @param maxGram the maximum length of the generated n-grams
+   * @param preserveOriginal Whether or not to keep the original term when it
+   *          is outside the min/max size range.
    */
-  public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
+  public EdgeNGramTokenFilter(
+      TokenStream input, int minGram, int maxGram, boolean preserveOriginal) {
     super(input);
+
     if (minGram < 1) {

@@ -67,6 +84,39 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
 
     this.minGram = minGram;
     this.maxGram = maxGram;
+    this.preserveOriginal = preserveOriginal;
   }
 
+  /**
+   * Creates an EdgeNGramTokenFilter that produces edge n-grams of the given
+   * size.
+   *
+   * @param input {@link TokenStream} holding the input to be tokenized
+   * @param gramSize the n-gram size to generate.
+   */
+  public EdgeNGramTokenFilter(TokenStream input, int gramSize) {
+    this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL);
+  }
+
+  /**
+   * Creates an EdgeNGramTokenFilter that, for a given input term, produces all
+   * edge n-grams with lengths >= minGram and <= maxGram.
+   *
+   * <p>
+   * Behaves the same as
+   * {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)
+   * EdgeNGramTokenFilter(input, minGram, maxGram, false)}
+   *
+   * @param input {@link TokenStream} holding the input to be tokenized
+   * @param minGram the minimum length of the generated n-grams
+   * @param maxGram the maximum length of the generated n-grams
+   *
+   * @deprecated since 7.4. Use
+   * {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)} instead.
+   */
+  @Deprecated
+  public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
+    this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL);
+  }
+
   @Override

@@ -75,32 +125,46 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
       if (curTermBuffer == null) {
         if (!input.incrementToken()) {
           return false;
-        } else {
-          curTermBuffer = termAtt.buffer().clone();
-          curTermLength = termAtt.length();
-          curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
-          curGramSize = minGram;
-          state = captureState();
-          savePosIncr += posIncrAtt.getPositionIncrement();
         }
+        state = captureState();
+
+        curTermLength = termAtt.length();
+        curTermCodePointCount = Character.codePointCount(termAtt, 0, curTermLength);
+        curPosIncr += posIncrAtt.getPositionIncrement();
+
+        if (preserveOriginal && curTermCodePointCount < minGram) {
+          // Token is shorter than minGram, but we'd still like to keep it.
+          posIncrAtt.setPositionIncrement(curPosIncr);
+          curPosIncr = 0;
+          return true;
+        }
+
+        curTermBuffer = termAtt.buffer().clone();
+        curGramSize = minGram;
       }
-      if (curGramSize <= maxGram) {         // if we have hit the end of our n-gram size range, quit
-        if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
-          // grab gramSize chars from front or back
+
+      if (curGramSize <= curTermCodePointCount) {
+        if (curGramSize <= maxGram) { // curGramSize is between minGram and maxGram
           restoreState(state);
           // first ngram gets increment, others don't
-          if (curGramSize == minGram) {
-            posIncrAtt.setPositionIncrement(savePosIncr);
-            savePosIncr = 0;
-          } else {
-            posIncrAtt.setPositionIncrement(0);
-          }
+          posIncrAtt.setPositionIncrement(curPosIncr);
+          curPosIncr = 0;
+
           final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
           termAtt.copyBuffer(curTermBuffer, 0, charLength);
           curGramSize++;
           return true;
         }
+        else if (preserveOriginal) {
+          // Token is longer than maxGram, but we'd still like to keep it.
+          restoreState(state);
+          posIncrAtt.setPositionIncrement(0);
+          termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
+          curTermBuffer = null;
+          return true;
+        }
       }
+      // Done with this input token, get next token on the next iteration.
       curTermBuffer = null;
     }
   }

@@ -109,6 +173,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
   public void reset() throws IOException {
     super.reset();
     curTermBuffer = null;
-    savePosIncr = 0;
+    curPosIncr = 0;
   }
 }

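The rewritten incrementToken() above accumulates the position increments of tokens that produce no output into curPosIncr, so dropped short tokens no longer collapse positions. A hedged sketch of the observable effect (the expected values follow the tests further down; the demo class is only illustrative):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class EdgeNGramPositionsSketch {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("a bcd"));
    // minGram=2 drops "a" entirely, but its increment is carried over:
    // the first gram of "bcd" reports a position increment of 2.
    TokenStream ts = new EdgeNGramTokenFilter(tok, 2, 3, false);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term + " +" + posIncr.getPositionIncrement()); // bc +2, bcd +0
    }
    ts.end();
    ts.close();
  }
}
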
@@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
  * <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
  * <analyzer>
  * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- * <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/>
+ * <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/>
  * </analyzer>
  * </fieldType></pre>
  */
 public class NGramFilterFactory extends TokenFilterFactory {
   private final int maxGramSize;
   private final int minGramSize;
+  private final boolean preserveOriginal;
 
   /** Creates a new NGramFilterFactory */
   public NGramFilterFactory(Map<String, String> args) {
     super(args);
     minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
     maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
+    preserveOriginal = getBoolean(args, "preserveOriginal", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }

@@ -49,6 +51,6 @@ public class NGramFilterFactory extends TokenFilterFactory {
 
   @Override
   public TokenFilter create(TokenStream input) {
-    return new NGramTokenFilter(input, minGramSize, maxGramSize);
+    return new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
   }
 }

@@ -21,7 +21,6 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 

@@ -40,30 +39,52 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  * override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
  */
 public final class NGramTokenFilter extends TokenFilter {
+  /**
+   * @deprecated since 7.4 - this value will be required.
+   */
+  @Deprecated
   public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
-  public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
-
-  private final int minGram, maxGram;
+  /**
+   * @deprecated since 7.4 - this value will be required.
+   */
+  @Deprecated
+  public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
+  public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
+
+  private final int minGram;
+  private final int maxGram;
+  private final boolean preserveOriginal;
 
   private char[] curTermBuffer;
   private int curTermLength;
-  private int curCodePointCount;
+  private int curTermCodePointCount;
   private int curGramSize;
   private int curPos;
-  private int curPosInc;
+  private int curPosIncr;
   private State state;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final PositionIncrementAttribute posIncAtt;
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
   /**
-   * Creates NGramTokenFilter with given min and max n-grams.
+   * Creates an NGramTokenFilter that, for a given input term, produces all
+   * contained n-grams with lengths >= minGram and <= maxGram. Will
+   * optionally preserve the original term when its length is outside of the
+   * defined range.
+   *
+   * Note: Care must be taken when choosing minGram and maxGram; depending
+   * on the input token size, this filter potentially produces a huge number
+   * of terms.
+   *
    * @param input {@link TokenStream} holding the input to be tokenized
-   * @param minGram the smallest n-gram to generate
-   * @param maxGram the largest n-gram to generate
+   * @param minGram the minimum length of the generated n-grams
+   * @param maxGram the maximum length of the generated n-grams
+   * @param preserveOriginal Whether or not to keep the original term when it
+   *          is shorter than minGram or longer than maxGram
    */
-  public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
-    super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
+  public NGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean preserveOriginal) {
+    super(input);
     if (minGram < 1) {
       throw new IllegalArgumentException("minGram must be greater than zero");
     }

@@ -72,51 +93,107 @@ public final class NGramTokenFilter extends TokenFilter {
     }
     this.minGram = minGram;
     this.maxGram = maxGram;
-    posIncAtt = addAttribute(PositionIncrementAttribute.class);
+    this.preserveOriginal = preserveOriginal;
   }
 
+  /**
+   * Creates an NGramTokenFilter that produces n-grams of the indicated size.
+   *
+   * @param input {@link TokenStream} holding the input to be tokenized
+   * @param gramSize the size of n-grams to generate.
+   */
+  public NGramTokenFilter(TokenStream input, int gramSize) {
+    this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL);
+  }
+
+  /**
+   * Creates an NGramTokenFilter that, for a given input term, produces all
+   * contained n-grams with lengths >= minGram and <= maxGram.
+   *
+   * <p>
+   * Behaves the same as
+   * {@link #NGramTokenFilter(TokenStream, int, int, boolean)
+   * NGramTokenFilter(input, minGram, maxGram, false)}
+   *
+   * @param input {@link TokenStream} holding the input to be tokenized
+   * @param minGram the minimum length of the generated n-grams
+   * @param maxGram the maximum length of the generated n-grams
+   *
+   * @deprecated since 7.4. Use
+   * {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead.
+   */
+  @Deprecated
+  public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
+    this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL);
+  }
+
   /**
    * Creates NGramTokenFilter with default min and max n-grams.
+   *
+   * <p>
+   * Behaves the same as
+   * {@link #NGramTokenFilter(TokenStream, int, int, boolean)
+   * NGramTokenFilter(input, 1, 2, false)}
+   *
    * @param input {@link TokenStream} holding the input to be tokenized
+   * @deprecated since 7.4. Use
+   * {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead.
    */
+  @Deprecated
   public NGramTokenFilter(TokenStream input) {
-    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_PRESERVE_ORIGINAL);
   }
 
   /** Returns the next token in the stream, or null at EOS. */
   @Override
   public final boolean incrementToken() throws IOException {
     while (true) {
       if (curTermBuffer == null) {
         if (!input.incrementToken()) {
           return false;
-        } else {
-          curTermBuffer = termAtt.buffer().clone();
-          curTermLength = termAtt.length();
-          curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
-          curGramSize = minGram;
-          curPos = 0;
-          curPosInc = posIncAtt.getPositionIncrement();
-          state = captureState();
         }
+        state = captureState();
+
+        curTermLength = termAtt.length();
+        curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
+        curPosIncr += posIncrAtt.getPositionIncrement();
+        curPos = 0;
+
+        if (preserveOriginal && curTermCodePointCount < minGram) {
+          // Token is shorter than minGram, but we'd still like to keep it.
+          posIncrAtt.setPositionIncrement(curPosIncr);
+          curPosIncr = 0;
+          return true;
+        }
+
+        curTermBuffer = termAtt.buffer().clone();
+        curGramSize = minGram;
       }
 
-      if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
+      if (curGramSize > maxGram || (curPos + curGramSize) > curTermCodePointCount) {
         ++curPos;
         curGramSize = minGram;
       }
-      if ((curPos + curGramSize) <= curCodePointCount) {
+      if ((curPos + curGramSize) <= curTermCodePointCount) {
         restoreState(state);
         final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
         final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
         termAtt.copyBuffer(curTermBuffer, start, end - start);
-        posIncAtt.setPositionIncrement(curPosInc);
-        curPosInc = 0;
+        posIncrAtt.setPositionIncrement(curPosIncr);
+        curPosIncr = 0;
         curGramSize++;
         return true;
       }
-      curTermBuffer = null;
+      else if (preserveOriginal && curTermCodePointCount > maxGram) {
+        // Token is longer than maxGram, but we'd still like to keep it.
+        restoreState(state);
+        posIncrAtt.setPositionIncrement(0);
+        termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
+        curTermBuffer = null;
+        return true;
+      }
+
+      // Done with this input token, get next token on next iteration.
+      curTermBuffer = null;
     }
   }
 

@@ -124,5 +201,6 @@ public final class NGramTokenFilter extends TokenFilter {
   public void reset() throws IOException {
     super.reset();
     curTermBuffer = null;
+    curPosIncr = 0;
   }
 }

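The constructor change above is behavioral as well as cosmetic: the old 3-argument constructor wrapped its input in a CodepointCountFilter that silently discarded tokens shorter than minGram, so a preserved original could never reach the filter. A hedged sketch of the visible difference (expected output follows the tests below; the demo class is only illustrative):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NGramPreserveOriginalSketch {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("abc de fgh"));
    // minGram=maxGram=3: "de" is too short to produce any 3-gram, but
    // preserveOriginal=true lets it pass through unchanged.
    TokenStream ts = new NGramTokenFilter(tok, 3, 3, true);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // abc, de, fgh
    }
    ts.end();
    ts.close();
  }
}
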
@@ -236,7 +236,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
       //TokenStream stream = new SopTokenFilter(tokenizer);
       TokenStream stream = new ShingleFilter(tokenizer, 5);
       //stream = new SopTokenFilter(stream);
-      stream = new NGramTokenFilter(stream, 55, 83);
+      stream = new NGramTokenFilter(stream, 55, 83, false);
       //stream = new SopTokenFilter(stream);
       return new TokenStreamComponents(tokenizer, stream);
     }

@@ -50,49 +50,73 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
 
   public void testInvalidInput() throws Exception {
     expectThrows(IllegalArgumentException.class, () -> {
-      new EdgeNGramTokenFilter(input, 0, 0);
+      new EdgeNGramTokenFilter(input, 0, 0, false);
     });
   }
 
   public void testInvalidInput2() throws Exception {
     expectThrows(IllegalArgumentException.class, () -> {
-      new EdgeNGramTokenFilter(input, 2, 1);
+      new EdgeNGramTokenFilter(input, 2, 1, false);
     });
   }
 
   public void testInvalidInput3() throws Exception {
     expectThrows(IllegalArgumentException.class, () -> {
-      new EdgeNGramTokenFilter(input, -1, 2);
+      new EdgeNGramTokenFilter(input, -1, 2, false);
     });
   }
 
   public void testFrontUnigram() throws Exception {
-    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1);
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1, false);
     assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{5});
   }
 
   public void testOversizedNgrams() throws Exception {
-    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6);
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, false);
     assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
   }
 
+  public void testOversizedNgramsPreserveOriginal() throws Exception {
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, true);
+    assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
+  }
+
+  public void testPreserveOriginal() throws Exception {
+    final String inputString = "a bcd efghi jk";
+
+    { // preserveOriginal = false
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false);
+      assertTokenStreamContents(filter,
+          new String[] { "bc", "bcd", "ef", "efg", "jk" },
+          new int[] { 2, 2, 6, 6, 12 },
+          new int[] { 5, 5, 11, 11, 14 },
+          new int[] { 2, 0, 1, 0, 1 });
+    }
+
+    { // preserveOriginal = true
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true);
+      assertTokenStreamContents(filter,
+          new String[] { "a", "bc", "bcd", "ef", "efg", "efghi", "jk" },
+          new int[] { 0, 2, 2, 6, 6, 6, 12 },
+          new int[] { 1, 5, 5, 11, 11, 11, 14 },
+          new int[] { 1, 1, 0, 1, 0, 0, 1 });
+    }
+  }
+
   public void testFrontRangeOfNgrams() throws Exception {
-    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3);
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3, false);
     assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
   }
 
   public void testFilterPositions() throws Exception {
     TokenStream ts = whitespaceMockTokenizer("abcde vwxyz");
-    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3);
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3, false);
     assertTokenStreamContents(tokenizer,
-        new String[]{"a","ab","abc","v","vw","vwx"},
-        new int[]{0,0,0,6,6,6},
-        new int[]{5,5,5,11,11,11},
-        null,
-        new int[]{1,0,0,1,0,0},
-        null,
-        null,
-        false);
+        new String[] {"a","ab","abc","v","vw","vwx"},
+        new int[] {0, 0, 0, 6, 6, 6},
+        new int[] {5, 5, 5, 11, 11, 11});
   }
 
   private static class PositionFilter extends TokenFilter {

@@ -128,7 +152,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
   public void testFirstTokenPositionIncrement() throws Exception {
     TokenStream ts = whitespaceMockTokenizer("a abc");
     ts = new PositionFilter(ts); // All but first token will get 0 position increment
-    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3);
+    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false);
     // The first token "a" will not be output, since it's smaller than the mingram size of 2.
     // The second token on input to EdgeNGramTokenFilter will have position increment of 0,
     // which should be increased to 1, since this is the first output token in the stream.

@@ -142,14 +166,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
 
   public void testSmallTokenInStream() throws Exception {
     input = whitespaceMockTokenizer("abc de fgh");
-    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3);
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3, false);
     assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
   }
 
   public void testReset() throws Exception {
     WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
     tokenizer.setReader(new StringReader("abcde"));
-    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3);
+    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false);
     assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
     tokenizer.setReader(new StringReader("abcde"));
     assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});

@@ -160,13 +184,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
     for (int i = 0; i < 10; i++) {
       final int min = TestUtil.nextInt(random(), 2, 10);
       final int max = TestUtil.nextInt(random(), min, 20);
+      final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
 
       Analyzer a = new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
           return new TokenStreamComponents(tokenizer,
-              new EdgeNGramTokenFilter(tokenizer, min, max));
+              new EdgeNGramTokenFilter(tokenizer, min, max, preserveOriginal));
         }
       };
       checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);

@@ -181,7 +206,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new KeywordTokenizer();
         return new TokenStreamComponents(tokenizer,
-            new EdgeNGramTokenFilter(tokenizer, 2, 15));
+            new EdgeNGramTokenFilter(tokenizer, 2, 15, false));
       }
     };
     checkAnalysisConsistency(random, a, random.nextBoolean(), "");

@@ -192,7 +217,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
     TokenStream tk = new LetterTokenizer();
     ((Tokenizer)tk).setReader(new StringReader("abc d efgh ij klmno p q"));
     tk = new ShingleFilter(tk);
-    tk = new EdgeNGramTokenFilter(tk, 7, 10);
+    tk = new EdgeNGramTokenFilter(tk, 7, 10, false);
     assertTokenStreamContents(tk,
         new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
         new int[] { 6,11,11,14 },

@@ -204,23 +229,44 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
   }
 
   public void testSupplementaryCharacters() throws IOException {
-    final String s = TestUtil.randomUnicodeString(random(), 10);
-    final int codePointCount = s.codePointCount(0, s.length());
-    final int minGram = TestUtil.nextInt(random(), 1, 3);
-    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
-    TokenStream tk = new KeywordTokenizer();
-    ((Tokenizer)tk).setReader(new StringReader(s));
-    tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
-    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
-    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
-    tk.reset();
-    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
-      assertTrue(tk.incrementToken());
-      assertEquals(0, offsetAtt.startOffset());
-      assertEquals(s.length(), offsetAtt.endOffset());
-      final int end = Character.offsetByCodePoints(s, 0, i);
-      assertEquals(s.substring(0, end), termAtt.toString());
-    }
-    assertFalse(tk.incrementToken());
+    for (int i = 0; i < 20; i++) {
+      final String s = TestUtil.randomUnicodeString(random(), 10);
+      final int codePointCount = s.codePointCount(0, s.length());
+      final int minGram = TestUtil.nextInt(random(), 1, 3);
+      final int maxGram = TestUtil.nextInt(random(), minGram, 10);
+      final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
+
+      TokenStream tk = new KeywordTokenizer();
+      ((Tokenizer)tk).setReader(new StringReader(s));
+      tk = new EdgeNGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
+      final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
+      final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
+      tk.reset();
+
+      if (codePointCount < minGram && preserveOriginal) {
+        assertTrue(tk.incrementToken());
+        assertEquals(0, offsetAtt.startOffset());
+        assertEquals(s.length(), offsetAtt.endOffset());
+        assertEquals(s, termAtt.toString());
+      }
+
+      for (int j = minGram; j <= Math.min(codePointCount, maxGram); j++) {
+        assertTrue(tk.incrementToken());
+        assertEquals(0, offsetAtt.startOffset());
+        assertEquals(s.length(), offsetAtt.endOffset());
+        final int end = Character.offsetByCodePoints(s, 0, j);
+        assertEquals(s.substring(0, end), termAtt.toString());
+      }
+
+      if (codePointCount > maxGram && preserveOriginal) {
+        assertTrue(tk.incrementToken());
+        assertEquals(0, offsetAtt.startOffset());
+        assertEquals(s.length(), offsetAtt.endOffset());
+        assertEquals(s, termAtt.toString());
+      }
+
+      assertFalse(tk.incrementToken());
+      tk.close();
+    }
   }
 }

@@ -48,28 +48,28 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
 
   public void testInvalidInput() throws Exception {
     expectThrows(IllegalArgumentException.class, () -> {
-      new NGramTokenFilter(input, 2, 1);
+      new NGramTokenFilter(input, 2, 1, false);
     });
   }
 
   public void testInvalidInput2() throws Exception {
     expectThrows(IllegalArgumentException.class, () -> {
-      new NGramTokenFilter(input, 0, 1);
+      new NGramTokenFilter(input, 0, 1, false);
     });
   }
 
   public void testUnigrams() throws Exception {
-    NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
+    NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1, false);
     assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
   }
 
   public void testBigrams() throws Exception {
-    NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
+    NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2, false);
     assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0});
   }
 
   public void testNgrams() throws Exception {
-    NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
+    NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false);
     assertTokenStreamContents(filter,
         new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
         new int[]{0,0,0,0,0,0,0,0,0,0,0,0},

@@ -81,7 +81,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
   }
 
   public void testNgramsNoIncrement() throws Exception {
-    NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
+    NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false);
     assertTokenStreamContents(filter,
         new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
         new int[]{0,0,0,0,0,0,0,0,0,0,0,0},

@@ -93,25 +93,61 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
   }
 
   public void testOversizedNgrams() throws Exception {
-    NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
+    NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7, false);
     assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
   }
 
+  public void testOversizedNgramsPreserveOriginal() throws Exception {
+    NGramTokenFilter tokenizer = new NGramTokenFilter(input, 6, 6, true);
+    assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
+  }
+
   public void testSmallTokenInStream() throws Exception {
     input = whitespaceMockTokenizer("abc de fgh");
-    NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
-    assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
+    NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, false);
+    assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
   }
 
+  public void testSmallTokenInStreamPreserveOriginal() throws Exception {
+    input = whitespaceMockTokenizer("abc de fgh");
+    NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, true);
+    assertTokenStreamContents(tokenizer, new String[]{"abc","de","fgh"}, new int[]{0,4,7}, new int[]{3,6,10}, new int[] {1, 1, 1});
+  }
+
   public void testReset() throws Exception {
     WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
     tokenizer.setReader(new StringReader("abcde"));
-    NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
+    NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false);
    assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
     tokenizer.setReader(new StringReader("abcde"));
     assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
   }
 
+  public void testKeepShortTermKeepLongTerm() throws Exception {
+    final String inputString = "a bcd efghi jk";
+
+    { // preserveOriginal = false
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, false);
+      assertTokenStreamContents(filter,
+          new String[] { "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" },
+          new int[] { 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 12 },
+          new int[] { 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 14 },
+          new int[] { 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 });
+    }
+
+    { // preserveOriginal = true
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true);
+      assertTokenStreamContents(filter,
+          new String[] { "a", "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" },
+          new int[] { 0, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 12 },
+          new int[] { 1, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 14 },
+          new int[] { 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 });
+    }
+  }
+
   // LUCENE-3642
   // EdgeNgram blindly adds term length to offset, but this can take things out of bounds
   // wrt original text if a previous filter increases the length of the word (in this case æ -> ae)

@@ -122,7 +158,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
         TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
-        filters = new NGramTokenFilter(filters, 2, 2);
+        filters = new NGramTokenFilter(filters, 2, 2, false);
         return new TokenStreamComponents(tokenizer, filters);
       }
     };

@@ -139,12 +175,14 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
     for (int i = 0; i < 10; i++) {
       final int min = TestUtil.nextInt(random(), 2, 10);
       final int max = TestUtil.nextInt(random(), min, 20);
+      final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
+
       Analyzer a = new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
           return new TokenStreamComponents(tokenizer,
-              new NGramTokenFilter(tokenizer, min, max));
+              new NGramTokenFilter(tokenizer, min, max, preserveOriginal));
         }
       };
       checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);

@@ -159,7 +197,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new KeywordTokenizer();
         return new TokenStreamComponents(tokenizer,
-            new NGramTokenFilter(tokenizer, 2, 15));
+            new NGramTokenFilter(tokenizer, 2, 15, false));
       }
     };
     checkAnalysisConsistency(random, a, random.nextBoolean(), "");

@@ -167,27 +205,47 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
   }
 
   public void testSupplementaryCharacters() throws IOException {
-    final String s = TestUtil.randomUnicodeString(random(), 10);
-    final int codePointCount = s.codePointCount(0, s.length());
-    final int minGram = TestUtil.nextInt(random(), 1, 3);
-    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
-    TokenStream tk = new KeywordTokenizer();
-    ((Tokenizer)tk).setReader(new StringReader(s));
-    tk = new NGramTokenFilter(tk, minGram, maxGram);
-    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
-    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
-    tk.reset();
-    for (int start = 0; start < codePointCount; ++start) {
-      for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
-        assertTrue(tk.incrementToken());
-        assertEquals(0, offsetAtt.startOffset());
-        assertEquals(s.length(), offsetAtt.endOffset());
-        final int startIndex = Character.offsetByCodePoints(s, 0, start);
-        final int endIndex = Character.offsetByCodePoints(s, 0, end);
-        assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
-      }
-    }
-    assertFalse(tk.incrementToken());
+    for (int i = 0; i < 20; i++) {
+      final String s = TestUtil.randomUnicodeString(random(), 10);
+      final int codePointCount = s.codePointCount(0, s.length());
+      final int minGram = TestUtil.nextInt(random(), 1, 3);
+      final int maxGram = TestUtil.nextInt(random(), minGram, 10);
+      final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
+
+      TokenStream tk = new KeywordTokenizer();
+      ((Tokenizer)tk).setReader(new StringReader(s));
+      tk = new NGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
+      final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
+      final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
+      tk.reset();
+
+      if (codePointCount < minGram && preserveOriginal) {
+        assertTrue(tk.incrementToken());
+        assertEquals(0, offsetAtt.startOffset());
+        assertEquals(s.length(), offsetAtt.endOffset());
+        assertEquals(s, termAtt.toString());
+      }
+
+      for (int start = 0; start < codePointCount; ++start) {
+        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
+          assertTrue(tk.incrementToken());
+          assertEquals(0, offsetAtt.startOffset());
+          assertEquals(s.length(), offsetAtt.endOffset());
+          final int startIndex = Character.offsetByCodePoints(s, 0, start);
+          final int endIndex = Character.offsetByCodePoints(s, 0, end);
+          assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
+        }
+      }
+
+      if (codePointCount > maxGram && preserveOriginal) {
+        assertTrue(tk.incrementToken());
+        assertEquals(0, offsetAtt.startOffset());
+        assertEquals(s.length(), offsetAtt.endOffset());
+        assertEquals(s, termAtt.toString());
+      }
+
+      assertFalse(tk.incrementToken());
+      tk.close();
+    }
   }
 
 }

@@ -56,12 +56,14 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
   }
 
   /**
-   * Test the NGramFilterFactory
+   * Test the NGramFilterFactory with old defaults
    */
   public void testNGramFilter() throws Exception {
     Reader reader = new StringReader("test");
     TokenStream stream = whitespaceMockTokenizer(reader);
-    stream = tokenFilterFactory("NGram").create(stream);
+    stream = tokenFilterFactory("NGram",
+        "minGramSize", "1",
+        "maxGramSize", "2").create(stream);
     assertTokenStreamContents(stream,
         new String[] { "t", "te", "e", "es", "s", "st", "t" });
   }

@@ -126,12 +128,13 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
   }
 
   /**
-   * Test EdgeNGramFilterFactory
+   * Test EdgeNGramFilterFactory with old defaults
    */
   public void testEdgeNGramFilter() throws Exception {
     Reader reader = new StringReader("test");
     TokenStream stream = whitespaceMockTokenizer(reader);
-    stream = tokenFilterFactory("EdgeNGram").create(stream);
+    stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1",
+        "maxGramSize", "1").create(stream);
     assertTokenStreamContents(stream,
         new String[] { "t" });
   }

@@ -173,7 +176,8 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
 
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+    IllegalArgumentException expected = null;
+    expected = expectThrows(IllegalArgumentException.class, () -> {
       tokenizerFactory("NGram", "bogusArg", "bogusValue");
     });
     assertTrue(expected.getMessage().contains("Unknown parameters"));

@@ -184,12 +188,12 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
     assertTrue(expected.getMessage().contains("Unknown parameters"));
 
     expected = expectThrows(IllegalArgumentException.class, () -> {
-      tokenFilterFactory("NGram", "bogusArg", "bogusValue");
+      tokenFilterFactory("NGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue");
     });
     assertTrue(expected.getMessage().contains("Unknown parameters"));
 
     expected = expectThrows(IllegalArgumentException.class, () -> {
-      tokenFilterFactory("EdgeNGram", "bogusArg", "bogusValue");
+      tokenFilterFactory("EdgeNGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue");
     });
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }

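The same configuration can also be exercised outside the test framework through the analysis SPI; a hedged sketch, where the lookup name and the wrapper class are assumptions mirroring the factory tests above:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class FactoryLookupSketch {
  static TokenFilterFactory edgeNGram() {
    Map<String, String> args = new HashMap<>();
    args.put("minGramSize", "2");
    args.put("maxGramSize", "5");
    args.put("preserveOriginal", "true");
    // SPI lookup by name, as the tokenFilterFactory(...) test helper does
    return TokenFilterFactory.forName("edgeNGram", args);
  }
}
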
@@ -87,7 +87,7 @@ public class BM25NBClassifierTest extends ClassificationTestBase<BytesRef> {
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
       final Tokenizer tokenizer = new KeywordTokenizer();
-      return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
+      return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
     }
   }
 

@@ -86,7 +86,7 @@ public class CachingNaiveBayesClassifierTest extends ClassificationTestBase<Byte
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
       final Tokenizer tokenizer = new KeywordTokenizer();
-      return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
+      return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
     }
   }
 

@@ -89,7 +89,7 @@ public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase<Bytes
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
       final Tokenizer tokenizer = new KeywordTokenizer();
-      return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
+      return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
     }
   }
 

@@ -359,7 +359,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
           : "no need \"textgrams\" when minPrefixChars="+minPrefixChars;
         if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) {
           // TODO: should use an EdgeNGramTokenFilterFactory here
-          TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars);
+          TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false);
           return new TokenStreamComponents(components.getTokenizer(), filter);
         } else {
           return components;