LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters

Robert Muir 2018-06-04 21:24:20 -04:00
parent 59087d148a
commit 2c1ab31b4e
13 changed files with 392 additions and 135 deletions

View File: CHANGES.txt

@ -203,6 +203,9 @@ New Features
now use to also take pending deletes into account which ensures that all file
generations per segment always go forward. (Simon Willnauer)
* LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters.
(Ingomar Wesp, Shawn Heisey via Robert Muir)
* LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked
as such once it's introduced and can't be changed after the fact.
(Nhat Nguyen via Simon Willnauer)
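
A quick sketch of what the preserveOriginal option added by the LUCENE-7690 entry above does (illustration only, not part of the patch; assumed imports from org.apache.lucene.analysis, expected terms taken from EdgeNGramTokenFilterTest#testPreserveOriginal further down in this commit):

    // Assumed imports: org.apache.lucene.analysis.Tokenizer, org.apache.lucene.analysis.TokenStream,
    // org.apache.lucene.analysis.core.WhitespaceTokenizer,
    // org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter, java.io.StringReader
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("a bcd efghi jk"));
    // minGram=2, maxGram=3, preserveOriginal=true: terms outside the 2..3 range are kept as-is
    TokenStream ts = new EdgeNGramTokenFilter(source, 2, 3, true);
    // Expected terms: "a", "bc", "bcd", "ef", "efg", "efghi", "jk"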

View File: EdgeNGramFilterFactory.java

@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/&gt;
* &lt;filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class EdgeNGramFilterFactory extends TokenFilterFactory {
private final int maxGramSize;
private final int minGramSize;
private final boolean preserveOriginal;
/** Creates a new EdgeNGramFilterFactory */
public EdgeNGramFilterFactory(Map<String, String> args) {
super(args);
minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
preserveOriginal = getBoolean(args, "preserveOriginal", EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -49,6 +51,6 @@ public class EdgeNGramFilterFactory extends TokenFilterFactory {
@Override
public TokenFilter create(TokenStream input) {
return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize);
return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
}
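
For reference, a hedged sketch of driving this factory programmatically rather than from a Solr schema; the argument map mirrors the attributes shown in the javadoc above, and the surrounding setup is an assumption, not part of the patch:

    // Assumed imports: java.util.HashMap, java.util.Map, java.io.StringReader,
    // org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.Tokenizer,
    // org.apache.lucene.analysis.core.WhitespaceTokenizer
    Map<String, String> args = new HashMap<>();
    args.put("minGramSize", "2");
    args.put("maxGramSize", "3");
    args.put("preserveOriginal", "true");
    EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(args); // the constructor consumes the map
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("a bcd efghi jk"));
    TokenStream ts = factory.create(source); // same stream as configuring the filter in a schema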

View File: EdgeNGramTokenFilter.java

@ -32,29 +32,46 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* supplementary characters.
*/
public final class EdgeNGramTokenFilter extends TokenFilter {
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
private final int minGram;
private final int maxGram;
private final boolean preserveOriginal;
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curTermCodePointCount;
private int curGramSize;
private int savePosIncr;
private int curPosIncr;
private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
* Creates an EdgeNGramTokenFilter that, for a given input term, produces all
* edge n-grams with lengths &gt;= minGram and &lt;= maxGram. Will
* optionally preserve the original term when its length is outside of the
* defined range.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
* @param preserveOriginal Whether or not to keep the original term when it
* is outside the min/max size range.
*/
public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
public EdgeNGramTokenFilter(
TokenStream input, int minGram, int maxGram, boolean preserveOriginal) {
super(input);
if (minGram < 1) {
@ -67,6 +84,39 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
this.minGram = minGram;
this.maxGram = maxGram;
this.preserveOriginal = preserveOriginal;
}
/**
* Creates an EdgeNGramTokenFilter that produces edge n-grams of the given
* size.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param gramSize the n-gram size to generate.
*/
public EdgeNGramTokenFilter(TokenStream input, int gramSize) {
this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL);
}
/**
* Creates an EdgeNGramTokenFilter that, for a given input term, produces all
* edge n-grams with lengths &gt;= minGram and &lt;= maxGram.
*
* <p>
* Behaves the same as
* {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)
* NGramTokenFilter(input, minGram, maxGram, false)}
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
*
* @deprecated since 7.4. Use
* {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)} instead.
*/
@Deprecated
public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL);
}
@Override
@ -75,32 +125,46 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
if (curTermBuffer == null) {
if (!input.incrementToken()) {
return false;
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
}
state = captureState();
savePosIncr += posIncrAtt.getPositionIncrement();
curTermLength = termAtt.length();
curTermCodePointCount = Character.codePointCount(termAtt, 0, curTermLength);
curPosIncr += posIncrAtt.getPositionIncrement();
if (preserveOriginal && curTermCodePointCount < minGram) {
// Token is shorter than minGram, but we'd still like to keep it.
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
return true;
}
curTermBuffer = termAtt.buffer().clone();
curGramSize = minGram;
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
if (curGramSize <= curTermCodePointCount) {
if (curGramSize <= maxGram) { // curGramSize is between minGram and maxGram
restoreState(state);
// first ngram gets increment, others don't
if (curGramSize == minGram) {
posIncrAtt.setPositionIncrement(savePosIncr);
savePosIncr = 0;
} else {
posIncrAtt.setPositionIncrement(0);
}
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
return true;
}
else if (preserveOriginal) {
// Token is longer than maxGram, but we'd still like to keep it.
restoreState(state);
posIncrAtt.setPositionIncrement(0);
termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
curTermBuffer = null;
return true;
}
}
// Done with this input token, get next token on the next iteration.
curTermBuffer = null;
}
}
@ -109,6 +173,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
public void reset() throws IOException {
super.reset();
curTermBuffer = null;
savePosIncr = 0;
curPosIncr = 0;
}
}
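
To make the two preserveOriginal branches above concrete, a small sketch (assumed imports; KeywordTokenizer is used so the whole input is a single term). The output follows from the incrementToken() logic shown above and matches the expectations in EdgeNGramTokenFilterTest:

    // Assumed imports: org.apache.lucene.analysis.Tokenizer, org.apache.lucene.analysis.TokenStream,
    // org.apache.lucene.analysis.core.KeywordTokenizer,
    // org.apache.lucene.analysis.tokenattributes.CharTermAttribute, java.io.StringReader
    Tokenizer kw = new KeywordTokenizer();
    kw.setReader(new StringReader("abcde"));
    TokenStream ts = new EdgeNGramTokenFilter(kw, 1, 3, true);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // prints: a, ab, abc, abcde ("abcde" kept because it is longer than maxGram)
    }
    ts.end();
    ts.close();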

View File: NGramFilterFactory.java

@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/&gt;
* &lt;filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class NGramFilterFactory extends TokenFilterFactory {
private final int maxGramSize;
private final int minGramSize;
private final boolean preserveOriginal;
/** Creates a new NGramFilterFactory */
public NGramFilterFactory(Map<String, String> args) {
super(args);
minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
preserveOriginal = getBoolean(args, "preserveOriginal", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -49,6 +51,6 @@ public class NGramFilterFactory extends TokenFilterFactory {
@Override
public TokenFilter create(TokenStream input) {
return new NGramTokenFilter(input, minGramSize, maxGramSize);
return new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
}
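
A hedged alternative to the Solr schema snippet in the javadoc above: wiring the same factory through CustomAnalyzer. The builder calls exist in Lucene's analysis-common module; the SPI name "nGram" is the usual derived factory name and is an assumption here, not something this patch touches:

    // Assumed imports: org.apache.lucene.analysis.Analyzer, org.apache.lucene.analysis.custom.CustomAnalyzer
    Analyzer analyzer = CustomAnalyzer.builder()   // build() may throw IOException
        .withTokenizer("whitespace")
        .addTokenFilter("nGram",
            "minGramSize", "1",
            "maxGramSize", "2",
            "preserveOriginal", "true")
        .build();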

View File: NGramTokenFilter.java

@ -21,7 +21,6 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@ -40,30 +39,52 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
*/
public final class NGramTokenFilter extends TokenFilter {
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
private final int minGram, maxGram;
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
private final int minGram;
private final int maxGram;
private final boolean preserveOriginal;
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curTermCodePointCount;
private int curGramSize;
private int curPos;
private int curPosInc;
private int curPosIncr;
private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Creates NGramTokenFilter with given min and max n-grams.
* Creates an NGramTokenFilter that, for a given input term, produces all
* contained n-grams with lengths &gt;= minGram and &lt;= maxGram. Will
* optionally preserve the original term when its length is outside of the
* defined range.
*
* Note: Care must be taken when choosing minGram and maxGram; depending
* on the input token size, this filter potentially produces a huge number
* of terms.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
* @param preserveOriginal Whether or not to keep the original term when it
* is shorter than minGram or longer than maxGram
*/
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
public NGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean preserveOriginal) {
super(input);
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@ -72,50 +93,106 @@ public final class NGramTokenFilter extends TokenFilter {
}
this.minGram = minGram;
this.maxGram = maxGram;
this.preserveOriginal = preserveOriginal;
}
posIncAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Creates an NGramTokenFilter that produces n-grams of the indicated size.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param gramSize the size of n-grams to generate.
*/
public NGramTokenFilter(TokenStream input, int gramSize) {
this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL);
}
/**
* Creates an NGramTokenFilter that, for a given input term, produces all
* contained n-grams with lengths &gt;= minGram and &lt;= maxGram.
*
* <p>
* Behaves the same as
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)
* NGramTokenFilter(input, minGram, maxGram, false)}
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
*
* @deprecated since 7.4. Use
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead.
*/
@Deprecated
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL);
}
/**
* Creates NGramTokenFilter with default min and max n-grams.
*
* <p>
* Behaves the same as
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)
* NGramTokenFilter(input, 1, 2, false)}
*
* @param input {@link TokenStream} holding the input to be tokenized
* @deprecated since 7.4. Use
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead.
*/
@Deprecated
public NGramTokenFilter(TokenStream input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_PRESERVE_ORIGINAL);
}
/** Returns true if there is a next token in the stream, or false at EOS. */
@Override
public final boolean incrementToken() throws IOException {
while (true) {
if (curTermBuffer == null) {
if (!input.incrementToken()) {
return false;
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
state = captureState();
}
state = captureState();
curTermLength = termAtt.length();
curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curPosIncr += posIncrAtt.getPositionIncrement();
curPos = 0;
if (preserveOriginal && curTermCodePointCount < minGram) {
// Token is shorter than minGram, but we'd still like to keep it.
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
return true;
}
if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
curTermBuffer = termAtt.buffer().clone();
curGramSize = minGram;
}
if (curGramSize > maxGram || (curPos + curGramSize) > curTermCodePointCount) {
++curPos;
curGramSize = minGram;
}
if ((curPos + curGramSize) <= curCodePointCount) {
if ((curPos + curGramSize) <= curTermCodePointCount) {
restoreState(state);
final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
curGramSize++;
return true;
}
else if (preserveOriginal && curTermCodePointCount > maxGram) {
// Token is longer than maxGram, but we'd still like to keep it.
restoreState(state);
posIncrAtt.setPositionIncrement(0);
termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
curTermBuffer = null;
return true;
}
// Done with this input token, get next token on next iteration.
curTermBuffer = null;
}
}
@ -124,5 +201,6 @@ public final class NGramTokenFilter extends TokenFilter {
public void reset() throws IOException {
super.reset();
curTermBuffer = null;
curPosIncr = 0;
}
}
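
Pulling the pieces together, a self-contained sketch (class and field names are illustrative only) that consumes the filter through the standard reset/incrementToken/end/close contract and shows where the preserved originals land relative to the n-grams:

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.ngram.NGramTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

    public class NGramPreserveOriginalDemo {
      public static void main(String[] args) throws IOException {
        Tokenizer source = new WhitespaceTokenizer();
        source.setReader(new StringReader("a bcd efghi jk"));
        try (TokenStream ts = new NGramTokenFilter(source, 2, 3, true)) {
          CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
          PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            // "a" and "efghi" fall outside the 2..3 range but are emitted anyway;
            // all grams of one input term share its position (increment 0 after the first).
            System.out.println(term + " posIncr=" + posIncr.getPositionIncrement());
          }
          ts.end();
        }
      }
    }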

View File: TestBugInSomething.java

@ -236,7 +236,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
//TokenStream stream = new SopTokenFilter(tokenizer);
TokenStream stream = new ShingleFilter(tokenizer, 5);
//stream = new SopTokenFilter(stream);
stream = new NGramTokenFilter(stream, 55, 83);
stream = new NGramTokenFilter(stream, 55, 83, false);
//stream = new SopTokenFilter(stream);
return new TokenStreamComponents(tokenizer, stream);
}

View File: EdgeNGramTokenFilterTest.java

@ -50,49 +50,73 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testInvalidInput() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new EdgeNGramTokenFilter(input, 0, 0);
new EdgeNGramTokenFilter(input, 0, 0, false);
});
}
public void testInvalidInput2() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new EdgeNGramTokenFilter(input, 2, 1);
new EdgeNGramTokenFilter(input, 2, 1, false);
});
}
public void testInvalidInput3() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new EdgeNGramTokenFilter(input, -1, 2);
new EdgeNGramTokenFilter(input, -1, 2, false);
});
}
public void testFrontUnigram() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1, false);
assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{5});
}
public void testOversizedNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, false);
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
}
public void testOversizedNgramsPreserveOriginal() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, true);
assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
}
public void testPreserveOriginal() throws Exception {
final String inputString = "a bcd efghi jk";
{ // preserveOriginal = false
TokenStream ts = whitespaceMockTokenizer(inputString);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false);
assertTokenStreamContents(filter,
new String[] { "bc", "bcd", "ef", "efg", "jk" },
new int[] { 2, 2, 6, 6, 12 },
new int[] { 5, 5, 11, 11, 14 },
new int[] { 2, 0, 1, 0, 1 });
}
{ // preserveOriginal = true
TokenStream ts = whitespaceMockTokenizer(inputString);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true);
assertTokenStreamContents(filter,
new String[] { "a", "bc", "bcd", "ef", "efg", "efghi", "jk" },
new int[] { 0, 2, 2, 6, 6, 6, 12 },
new int[] { 1, 5, 5, 11, 11, 11, 14 },
new int[] { 1, 1, 0, 1, 0, 0, 1 });
}
}
public void testFrontRangeOfNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3, false);
assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
}
public void testFilterPositions() throws Exception {
TokenStream ts = whitespaceMockTokenizer("abcde vwxyz");
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3, false);
assertTokenStreamContents(tokenizer,
new String[]{"a","ab","abc","v","vw","vwx"},
new int[]{0,0,0,6,6,6},
new int[]{5,5,5,11,11,11},
null,
new int[]{1,0,0,1,0,0},
null,
null,
false);
new String[] {"a","ab","abc","v","vw","vwx"},
new int[] {0, 0, 0, 6, 6, 6},
new int[] {5, 5, 5, 11, 11, 11});
}
private static class PositionFilter extends TokenFilter {
@ -128,7 +152,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testFirstTokenPositionIncrement() throws Exception {
TokenStream ts = whitespaceMockTokenizer("a abc");
ts = new PositionFilter(ts); // All but first token will get 0 position increment
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false);
// The first token "a" will not be output, since it's smaller than the minGram size of 2.
// The second token on input to EdgeNGramTokenFilter will have position increment of 0,
// which should be increased to 1, since this is the first output token in the stream.
@ -142,14 +166,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testSmallTokenInStream() throws Exception {
input = whitespaceMockTokenizer("abc de fgh");
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3, false);
assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("abcde"));
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false);
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
tokenizer.setReader(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
@ -160,13 +184,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
for (int i = 0; i < 10; i++) {
final int min = TestUtil.nextInt(random(), 2, 10);
final int max = TestUtil.nextInt(random(), min, 20);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new EdgeNGramTokenFilter(tokenizer, min, max));
new EdgeNGramTokenFilter(tokenizer, min, max, preserveOriginal));
}
};
checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
@ -181,7 +206,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer,
new EdgeNGramTokenFilter(tokenizer, 2, 15));
new EdgeNGramTokenFilter(tokenizer, 2, 15, false));
}
};
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
@ -192,7 +217,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
TokenStream tk = new LetterTokenizer();
((Tokenizer)tk).setReader(new StringReader("abc d efgh ij klmno p q"));
tk = new ShingleFilter(tk);
tk = new EdgeNGramTokenFilter(tk, 7, 10);
tk = new EdgeNGramTokenFilter(tk, 7, 10, false);
assertTokenStreamContents(tk,
new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
new int[] { 6,11,11,14 },
@ -204,23 +229,44 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testSupplementaryCharacters() throws IOException {
for (int i = 0; i < 20; i++) {
final String s = TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = TestUtil.nextInt(random(), 1, 3);
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
TokenStream tk = new KeywordTokenizer();
((Tokenizer)tk).setReader(new StringReader(s));
tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
tk = new EdgeNGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
if (codePointCount < minGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int end = Character.offsetByCodePoints(s, 0, i);
assertEquals(s, termAtt.toString());
}
for (int j = minGram; j <= Math.min(codePointCount, maxGram); j++) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int end = Character.offsetByCodePoints(s, 0, j);
assertEquals(s.substring(0, end), termAtt.toString());
}
if (codePointCount > maxGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
assertEquals(s, termAtt.toString());
}
assertFalse(tk.incrementToken());
tk.close();
}
}
}

View File: NGramTokenFilterTest.java

@ -48,28 +48,28 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testInvalidInput() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new NGramTokenFilter(input, 2, 1);
new NGramTokenFilter(input, 2, 1, false);
});
}
public void testInvalidInput2() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new NGramTokenFilter(input, 0, 1);
new NGramTokenFilter(input, 0, 1, false);
});
}
public void testUnigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1, false);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
}
public void testBigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2, false);
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0});
}
public void testNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false);
assertTokenStreamContents(filter,
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
@ -81,7 +81,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testNgramsNoIncrement() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false);
assertTokenStreamContents(filter,
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
@ -93,25 +93,61 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testOversizedNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7, false);
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
}
public void testOversizedNgramsPreserveOriginal() throws Exception {
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 6, 6, true);
assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
}
public void testSmallTokenInStream() throws Exception {
input = whitespaceMockTokenizer("abc de fgh");
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, false);
assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
}
public void testSmallTokenInStreamPreserveOriginal() throws Exception {
input = whitespaceMockTokenizer("abc de fgh");
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, true);
assertTokenStreamContents(tokenizer, new String[]{"abc","de","fgh"}, new int[]{0,4,7}, new int[]{3,6,10}, new int[] {1, 1, 1});
}
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("abcde"));
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
tokenizer.setReader(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
}
public void testKeepShortTermKeepLongTerm() throws Exception {
final String inputString = "a bcd efghi jk";
{ // preserveOriginal = false
TokenStream ts = whitespaceMockTokenizer(inputString);
NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, false);
assertTokenStreamContents(filter,
new String[] { "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" },
new int[] { 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 12 },
new int[] { 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 14 },
new int[] { 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 });
}
{ // preserveOriginal = true
TokenStream ts = whitespaceMockTokenizer(inputString);
NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true);
assertTokenStreamContents(filter,
new String[] { "a", "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" },
new int[] { 0, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 12 },
new int[] { 1, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 14 },
new int[] { 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 });
}
}
// LUCENE-3642
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
@ -122,7 +158,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
filters = new NGramTokenFilter(filters, 2, 2);
filters = new NGramTokenFilter(filters, 2, 2, false);
return new TokenStreamComponents(tokenizer, filters);
}
};
@ -139,12 +175,14 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
for (int i = 0; i < 10; i++) {
final int min = TestUtil.nextInt(random(), 2, 10);
final int max = TestUtil.nextInt(random(), min, 20);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new NGramTokenFilter(tokenizer, min, max));
new NGramTokenFilter(tokenizer, min, max, preserveOriginal));
}
};
checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
@ -159,7 +197,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer,
new NGramTokenFilter(tokenizer, 2, 15));
new NGramTokenFilter(tokenizer, 2, 15, false));
}
};
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
@ -167,16 +205,27 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testSupplementaryCharacters() throws IOException {
for (int i = 0; i < 20; i++) {
final String s = TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = TestUtil.nextInt(random(), 1, 3);
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
TokenStream tk = new KeywordTokenizer();
((Tokenizer)tk).setReader(new StringReader(s));
tk = new NGramTokenFilter(tk, minGram, maxGram);
tk = new NGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
if (codePointCount < minGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
assertEquals(s, termAtt.toString());
}
for (int start = 0; start < codePointCount; ++start) {
for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
assertTrue(tk.incrementToken());
@ -187,7 +236,16 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
}
}
assertFalse(tk.incrementToken());
if (codePointCount > maxGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
assertEquals(s, termAtt.toString());
}
assertFalse(tk.incrementToken());
tk.close();
}
}
}

View File: TestNGramFilters.java

@ -56,12 +56,14 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
}
/**
* Test the NGramFilterFactory
* Test the NGramFilterFactory with old defaults
*/
public void testNGramFilter() throws Exception {
Reader reader = new StringReader("test");
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("NGram").create(stream);
stream = tokenFilterFactory("NGram",
"minGramSize", "1",
"maxGramSize", "2").create(stream);
assertTokenStreamContents(stream,
new String[] { "t", "te", "e", "es", "s", "st", "t" });
}
@ -126,12 +128,13 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
}
/**
* Test EdgeNGramFilterFactory
* Test EdgeNGramFilterFactory with old defaults
*/
public void testEdgeNGramFilter() throws Exception {
Reader reader = new StringReader("test");
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("EdgeNGram").create(stream);
stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1",
"maxGramSize", "1").create(stream);
assertTokenStreamContents(stream,
new String[] { "t" });
}
@ -173,7 +176,8 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
IllegalArgumentException expected = null;
expected = expectThrows(IllegalArgumentException.class, () -> {
tokenizerFactory("NGram", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
@ -184,12 +188,12 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
assertTrue(expected.getMessage().contains("Unknown parameters"));
expected = expectThrows(IllegalArgumentException.class, () -> {
tokenFilterFactory("NGram", "bogusArg", "bogusValue");
tokenFilterFactory("NGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
expected = expectThrows(IllegalArgumentException.class, () -> {
tokenFilterFactory("EdgeNGram", "bogusArg", "bogusValue");
tokenFilterFactory("EdgeNGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
}

View File: BM25NBClassifierTest.java

@ -87,7 +87,7 @@ public class BM25NBClassifierTest extends ClassificationTestBase<BytesRef> {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
}
}

View File: CachingNaiveBayesClassifierTest.java

@ -86,7 +86,7 @@ public class CachingNaiveBayesClassifierTest extends ClassificationTestBase<Byte
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
}
}

View File: SimpleNaiveBayesClassifierTest.java

@ -89,7 +89,7 @@ public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase<Bytes
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
}
}

View File: AnalyzingInfixSuggester.java

@ -359,7 +359,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
: "no need \"textgrams\" when minPrefixChars="+minPrefixChars;
if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) {
// TODO: should use an EdgeNGramTokenFilterFactory here
TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars);
TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false);
return new TokenStreamComponents(components.getTokenizer(), filter);
} else {
return components;