LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters

Robert Muir 2018-06-04 21:24:20 -04:00
parent 59087d148a
commit 2c1ab31b4e
13 changed files with 392 additions and 135 deletions

View File

@ -202,6 +202,9 @@ New Features
IndexFileDeleter already accounts for that for existing files which we can
now use to also take pending deletes into account which ensures that all file
generations per segment always go forward. (Simon Willnauer)
* LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters.
(Ingomar Wesp, Shawn Heisey via Robert Muir)
* LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked
as such once it's introduced and can't be changed after the fact.
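
For illustration of the LUCENE-7690 entry above, a minimal sketch of the new preserveOriginal option as exposed by this change. The demo class name and input string are invented for the example and are not part of this commit.

// Sketch only (assumed usage, not code from this commit): the new 4-arg
// constructor additionally emits the original term when its length falls
// outside [minGram, maxGram].
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PreserveOriginalDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("lucene"));
    TokenStream ts = new EdgeNGramTokenFilter(source, 2, 3, true);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // prints "lu", "luc", then the preserved "lucene"
    }
    ts.end();
    ts.close();
  }
}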

View File

@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/&gt;
* &lt;filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class EdgeNGramFilterFactory extends TokenFilterFactory {
private final int maxGramSize;
private final int minGramSize;
private final boolean preserveOriginal;
/** Creates a new EdgeNGramFilterFactory */
public EdgeNGramFilterFactory(Map<String, String> args) {
super(args);
minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
preserveOriginal = getBoolean(args, "preserveOriginal", EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -49,6 +51,6 @@ public class EdgeNGramFilterFactory extends TokenFilterFactory {
@Override
public TokenFilter create(TokenStream input) {
return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize);
return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
}
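
For context, a sketch (not part of the diff, helper class name invented) of driving this factory programmatically with the same parameters as the Javadoc example above.

// Sketch only: builds the same chain as the schema snippet in the class
// Javadoc. The args map is consumed by the constructor; any leftover key
// triggers the "Unknown parameters" IllegalArgumentException shown above.
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory;

public class EdgeNGramFactoryDemo {
  public static TokenFilter edgeGrams(TokenStream input) {
    Map<String, String> args = new HashMap<>();
    args.put("minGramSize", "1");
    args.put("maxGramSize", "2");
    args.put("preserveOriginal", "true");
    // Equivalent to new EdgeNGramTokenFilter(input, 1, 2, true)
    return new EdgeNGramFilterFactory(args).create(input);
  }
}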

View File

@ -32,29 +32,46 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* supplementary characters.
*/
public final class EdgeNGramTokenFilter extends TokenFilter {
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
private final int minGram;
private final int maxGram;
private final boolean preserveOriginal;
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curTermCodePointCount;
private int curGramSize;
private int savePosIncr;
private int curPosIncr;
private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
*
* Creates an EdgeNGramTokenFilter that, for a given input term, produces all
* edge n-grams with lengths &gt;= minGram and &lt;= maxGram. Will
* optionally preserve the original term when its length is outside of the
* defined range.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
* @param preserveOriginal Whether or not to keep the original term when it
* is outside the min/max size range.
*/
public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
public EdgeNGramTokenFilter(
TokenStream input, int minGram, int maxGram, boolean preserveOriginal) {
super(input);
if (minGram < 1) {
@ -67,6 +84,39 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
this.minGram = minGram;
this.maxGram = maxGram;
this.preserveOriginal = preserveOriginal;
}
/**
* Creates an EdgeNGramTokenFilter that produces edge n-grams of the given
* size.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param gramSize the n-gram size to generate.
*/
public EdgeNGramTokenFilter(TokenStream input, int gramSize) {
this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL);
}
/**
* Creates an EdgeNGramTokenFilter that, for a given input term, produces all
* edge n-grams with lengths &gt;= minGram and &lt;= maxGram.
*
* <p>
* Behaves the same as
* {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)
* EdgeNGramTokenFilter(input, minGram, maxGram, false)}
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
*
* @deprecated since 7.4. Use
* {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)} instead.
*/
@Deprecated
public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL);
}
@Override
@ -75,32 +125,46 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
if (curTermBuffer == null) {
if (!input.incrementToken()) {
return false;
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
state = captureState();
savePosIncr += posIncrAtt.getPositionIncrement();
}
state = captureState();
curTermLength = termAtt.length();
curTermCodePointCount = Character.codePointCount(termAtt, 0, curTermLength);
curPosIncr += posIncrAtt.getPositionIncrement();
if (preserveOriginal && curTermCodePointCount < minGram) {
// Token is shorter than minGram, but we'd still like to keep it.
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
return true;
}
curTermBuffer = termAtt.buffer().clone();
curGramSize = minGram;
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
if (curGramSize <= curTermCodePointCount) {
if (curGramSize <= maxGram) { // curGramSize is between minGram and maxGram
restoreState(state);
// first ngram gets increment, others don't
if (curGramSize == minGram) {
posIncrAtt.setPositionIncrement(savePosIncr);
savePosIncr = 0;
} else {
posIncrAtt.setPositionIncrement(0);
}
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
return true;
}
else if (preserveOriginal) {
// Token is longer than maxGram, but we'd still like to keep it.
restoreState(state);
posIncrAtt.setPositionIncrement(0);
termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
curTermBuffer = null;
return true;
}
}
// Done with this input token, get next token on the next iteration.
curTermBuffer = null;
}
}
@ -109,6 +173,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
public void reset() throws IOException {
super.reset();
curTermBuffer = null;
savePosIncr = 0;
curPosIncr = 0;
}
}
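
To make the curPosIncr bookkeeping above concrete, a hedged sketch (demo class invented, not part of this commit): a token dropped for being shorter than minGram passes its position increment on to the first gram emitted for the next token.

// Sketch only: position increments when preserveOriginal == false and a token
// is shorter than minGram ("a" is dropped here, "abc" is grammed).
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class EdgeNGramPosIncrDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("a abc"));
    TokenStream ts = new EdgeNGramTokenFilter(source, 2, 3, false);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // Prints "ab 2" (the dropped "a" folded its increment into the first
      // gram of "abc"), then "abc 0".
      System.out.println(term + " " + posIncr.getPositionIncrement());
    }
    ts.end();
    ts.close();
  }
}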

View File

@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/&gt;
* &lt;filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class NGramFilterFactory extends TokenFilterFactory {
private final int maxGramSize;
private final int minGramSize;
private final boolean preserveOriginal;
/** Creates a new NGramFilterFactory */
public NGramFilterFactory(Map<String, String> args) {
super(args);
minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
preserveOriginal = getBoolean(args, "preserveOriginal", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -49,6 +51,6 @@ public class NGramFilterFactory extends TokenFilterFactory {
@Override
public TokenFilter create(TokenStream input) {
return new NGramTokenFilter(input, minGramSize, maxGramSize);
return new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
}
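
Likewise for this factory, a sketch (class name invented, not part of the change) matching the Javadoc configuration; for the input "test" it would emit t, te, e, es, s, st, t and finally the preserved original "test".

// Sketch only: programmatic equivalent of the schema snippet in the class
// Javadoc (minGramSize=1, maxGramSize=2, preserveOriginal=true).
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.NGramFilterFactory;

public class NGramFactoryDemo {
  public static TokenFilter nGrams(TokenStream input) {
    Map<String, String> args = new HashMap<>();
    args.put("minGramSize", "1");
    args.put("maxGramSize", "2");
    args.put("preserveOriginal", "true");
    // Equivalent to new NGramTokenFilter(input, 1, 2, true)
    return new NGramFilterFactory(args).create(input);
  }
}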

View File

@ -21,7 +21,6 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@ -40,30 +39,52 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
*/
public final class NGramTokenFilter extends TokenFilter {
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
private final int minGram, maxGram;
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
private final int minGram;
private final int maxGram;
private final boolean preserveOriginal;
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curTermCodePointCount;
private int curGramSize;
private int curPos;
private int curPosInc;
private int curPosIncr;
private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Creates NGramTokenFilter with given min and max n-grams.
* Creates an NGramTokenFilter that, for a given input term, produces all
* contained n-grams with lengths &gt;= minGram and &lt;= maxGram. Will
* optionally preserve the original term when its length is outside of the
* defined range.
*
* Note: Care must be taken when choosing minGram and maxGram; depending
* on the input token size, this filter potentially produces a huge number
* of terms.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
* @param preserveOriginal Whether or not to keep the original term when it
* is shorter than minGram or longer than maxGram
*/
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
public NGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean preserveOriginal) {
super(input);
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@ -72,51 +93,107 @@ public final class NGramTokenFilter extends TokenFilter {
}
this.minGram = minGram;
this.maxGram = maxGram;
this.preserveOriginal = preserveOriginal;
}
/**
* Creates an NGramTokenFilter that produces n-grams of the indicated size.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param gramSize the size of n-grams to generate.
*/
public NGramTokenFilter(TokenStream input, int gramSize) {
this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL);
}
posIncAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Creates an NGramTokenFilter that, for a given input term, produces all
* contained n-grams with lengths &gt;= minGram and &lt;= maxGram.
*
* <p>
* Behaves the same as
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)
* NGramTokenFilter(input, minGram, maxGram, false)}
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
*
* @deprecated since 7.4. Use
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead.
*/
@Deprecated
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL);
}
/**
* Creates NGramTokenFilter with default min and max n-grams.
*
* <p>
* Behaves the same as
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)
* NGramTokenFilter(input, 1, 2, false)}
*
* @param input {@link TokenStream} holding the input to be tokenized
* @deprecated since 7.4. Use
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead.
*/
@Deprecated
public NGramTokenFilter(TokenStream input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_PRESERVE_ORIGINAL);
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public final boolean incrementToken() throws IOException {
while (true) {
if (curTermBuffer == null) {
if (!input.incrementToken()) {
return false;
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
state = captureState();
}
state = captureState();
curTermLength = termAtt.length();
curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curPosIncr += posIncrAtt.getPositionIncrement();
curPos = 0;
if (preserveOriginal && curTermCodePointCount < minGram) {
// Token is shorter than minGram, but we'd still like to keep it.
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
return true;
}
curTermBuffer = termAtt.buffer().clone();
curGramSize = minGram;
}
if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
if (curGramSize > maxGram || (curPos + curGramSize) > curTermCodePointCount) {
++curPos;
curGramSize = minGram;
}
if ((curPos + curGramSize) <= curCodePointCount) {
if ((curPos + curGramSize) <= curTermCodePointCount) {
restoreState(state);
final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
curGramSize++;
return true;
}
curTermBuffer = null;
else if (preserveOriginal && curTermCodePointCount > maxGram) {
// Token is longer than maxGram, but we'd still like to keep it.
restoreState(state);
posIncrAtt.setPositionIncrement(0);
termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
curTermBuffer = null;
return true;
}
// Done with this input token, get next token on next iteration.
curTermBuffer = null;
}
}
@ -124,5 +201,6 @@ public final class NGramTokenFilter extends TokenFilter {
public void reset() throws IOException {
super.reset();
curTermBuffer = null;
curPosIncr = 0;
}
}
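
A sketch of the behavioral effect of dropping the CodepointCountFilter wrapper (demo class invented, not part of this commit): tokens shorter than minGram now reach the filter and can be preserved, and over-long tokens are appended after their grams.

// Sketch only: with preserveOriginal == true, "ab" (shorter than minGram) and
// "cdef" (longer than maxGram) both survive alongside the grams of "cdef".
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NGramPreserveDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("ab cdef"));
    TokenStream ts = new NGramTokenFilter(source, 3, 3, true);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // prints "ab", "cde", "def", "cdef"
    }
    ts.end();
    ts.close();
  }
}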

View File

@ -236,7 +236,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
//TokenStream stream = new SopTokenFilter(tokenizer);
TokenStream stream = new ShingleFilter(tokenizer, 5);
//stream = new SopTokenFilter(stream);
stream = new NGramTokenFilter(stream, 55, 83);
stream = new NGramTokenFilter(stream, 55, 83, false);
//stream = new SopTokenFilter(stream);
return new TokenStreamComponents(tokenizer, stream);
}

View File

@ -50,49 +50,73 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testInvalidInput() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new EdgeNGramTokenFilter(input, 0, 0);
new EdgeNGramTokenFilter(input, 0, 0, false);
});
}
public void testInvalidInput2() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new EdgeNGramTokenFilter(input, 2, 1);
new EdgeNGramTokenFilter(input, 2, 1, false);
});
}
public void testInvalidInput3() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new EdgeNGramTokenFilter(input, -1, 2);
new EdgeNGramTokenFilter(input, -1, 2, false);
});
}
public void testFrontUnigram() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1, false);
assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{5});
}
public void testOversizedNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, false);
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
}
public void testOversizedNgramsPreserveOriginal() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, true);
assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
}
public void testPreserveOriginal() throws Exception {
final String inputString = "a bcd efghi jk";
{ // preserveOriginal = false
TokenStream ts = whitespaceMockTokenizer(inputString);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false);
assertTokenStreamContents(filter,
new String[] { "bc", "bcd", "ef", "efg", "jk" },
new int[] { 2, 2, 6, 6, 12 },
new int[] { 5, 5, 11, 11, 14 },
new int[] { 2, 0, 1, 0, 1 });
}
{ // preserveOriginal = true
TokenStream ts = whitespaceMockTokenizer(inputString);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true);
assertTokenStreamContents(filter,
new String[] { "a", "bc", "bcd", "ef", "efg", "efghi", "jk" },
new int[] { 0, 2, 2, 6, 6, 6, 12 },
new int[] { 1, 5, 5, 11, 11, 11, 14 },
new int[] { 1, 1, 0, 1, 0, 0, 1 });
}
}
public void testFrontRangeOfNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3, false);
assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
}
public void testFilterPositions() throws Exception {
TokenStream ts = whitespaceMockTokenizer("abcde vwxyz");
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3, false);
assertTokenStreamContents(tokenizer,
new String[]{"a","ab","abc","v","vw","vwx"},
new int[]{0,0,0,6,6,6},
new int[]{5,5,5,11,11,11},
null,
new int[]{1,0,0,1,0,0},
null,
null,
false);
new String[] {"a","ab","abc","v","vw","vwx"},
new int[] {0, 0, 0, 6, 6, 6},
new int[] {5, 5, 5, 11, 11, 11});
}
private static class PositionFilter extends TokenFilter {
@ -128,7 +152,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testFirstTokenPositionIncrement() throws Exception {
TokenStream ts = whitespaceMockTokenizer("a abc");
ts = new PositionFilter(ts); // All but first token will get 0 position increment
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false);
// The first token "a" will not be output, since it's smaller than the mingram size of 2.
// The second token on input to EdgeNGramTokenFilter will have position increment of 0,
// which should be increased to 1, since this is the first output token in the stream.
@ -142,14 +166,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testSmallTokenInStream() throws Exception {
input = whitespaceMockTokenizer("abc de fgh");
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3, false);
assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("abcde"));
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false);
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
tokenizer.setReader(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
@ -160,13 +184,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
for (int i = 0; i < 10; i++) {
final int min = TestUtil.nextInt(random(), 2, 10);
final int max = TestUtil.nextInt(random(), min, 20);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new EdgeNGramTokenFilter(tokenizer, min, max));
new EdgeNGramTokenFilter(tokenizer, min, max, preserveOriginal));
}
};
checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
@ -181,7 +206,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer,
new EdgeNGramTokenFilter(tokenizer, 2, 15));
new EdgeNGramTokenFilter(tokenizer, 2, 15, false));
}
};
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
@ -192,7 +217,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
TokenStream tk = new LetterTokenizer();
((Tokenizer)tk).setReader(new StringReader("abc d efgh ij klmno p q"));
tk = new ShingleFilter(tk);
tk = new EdgeNGramTokenFilter(tk, 7, 10);
tk = new EdgeNGramTokenFilter(tk, 7, 10, false);
assertTokenStreamContents(tk,
new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
new int[] { 6,11,11,14 },
@ -204,23 +229,44 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testSupplementaryCharacters() throws IOException {
final String s = TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = TestUtil.nextInt(random(), 1, 3);
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
TokenStream tk = new KeywordTokenizer();
((Tokenizer)tk).setReader(new StringReader(s));
tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int end = Character.offsetByCodePoints(s, 0, i);
assertEquals(s.substring(0, end), termAtt.toString());
for (int i = 0; i < 20; i++) {
final String s = TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = TestUtil.nextInt(random(), 1, 3);
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
TokenStream tk = new KeywordTokenizer();
((Tokenizer)tk).setReader(new StringReader(s));
tk = new EdgeNGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
if (codePointCount < minGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
assertEquals(s, termAtt.toString());
}
for (int j = minGram; j <= Math.min(codePointCount, maxGram); j++) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int end = Character.offsetByCodePoints(s, 0, j);
assertEquals(s.substring(0, end), termAtt.toString());
}
if (codePointCount > maxGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
assertEquals(s, termAtt.toString());
}
assertFalse(tk.incrementToken());
tk.close();
}
assertFalse(tk.incrementToken());
}
}

View File

@ -48,28 +48,28 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testInvalidInput() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new NGramTokenFilter(input, 2, 1);
new NGramTokenFilter(input, 2, 1, false);
});
}
public void testInvalidInput2() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new NGramTokenFilter(input, 0, 1);
new NGramTokenFilter(input, 0, 1, false);
});
}
public void testUnigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1, false);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
}
public void testBigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2, false);
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0});
}
public void testNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false);
assertTokenStreamContents(filter,
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
@ -81,7 +81,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testNgramsNoIncrement() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false);
assertTokenStreamContents(filter,
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
@ -93,25 +93,61 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testOversizedNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7, false);
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
}
public void testOversizedNgramsPreserveOriginal() throws Exception {
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 6, 6, true);
assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
}
public void testSmallTokenInStream() throws Exception {
input = whitespaceMockTokenizer("abc de fgh");
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, false);
assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
}
public void testSmallTokenInStreamPreserveOriginal() throws Exception {
input = whitespaceMockTokenizer("abc de fgh");
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, true);
assertTokenStreamContents(tokenizer, new String[]{"abc","de","fgh"}, new int[]{0,4,7}, new int[]{3,6,10}, new int[] {1, 1, 1});
}
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("abcde"));
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
tokenizer.setReader(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
}
public void testKeepShortTermKeepLongTerm() throws Exception {
final String inputString = "a bcd efghi jk";
{ // preserveOriginal = false
TokenStream ts = whitespaceMockTokenizer(inputString);
NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, false);
assertTokenStreamContents(filter,
new String[] { "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" },
new int[] { 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 12 },
new int[] { 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 14 },
new int[] { 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 });
}
{ // preserveOriginal = true
TokenStream ts = whitespaceMockTokenizer(inputString);
NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true);
assertTokenStreamContents(filter,
new String[] { "a", "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" },
new int[] { 0, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 12 },
new int[] { 1, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 14 },
new int[] { 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 });
}
}
// LUCENE-3642
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
@ -122,7 +158,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
filters = new NGramTokenFilter(filters, 2, 2);
filters = new NGramTokenFilter(filters, 2, 2, false);
return new TokenStreamComponents(tokenizer, filters);
}
};
@ -139,12 +175,14 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
for (int i = 0; i < 10; i++) {
final int min = TestUtil.nextInt(random(), 2, 10);
final int max = TestUtil.nextInt(random(), min, 20);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new NGramTokenFilter(tokenizer, min, max));
new NGramTokenFilter(tokenizer, min, max, preserveOriginal));
}
};
checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
@ -159,7 +197,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer,
new NGramTokenFilter(tokenizer, 2, 15));
new NGramTokenFilter(tokenizer, 2, 15, false));
}
};
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
@ -167,27 +205,47 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testSupplementaryCharacters() throws IOException {
final String s = TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = TestUtil.nextInt(random(), 1, 3);
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
TokenStream tk = new KeywordTokenizer();
((Tokenizer)tk).setReader(new StringReader(s));
tk = new NGramTokenFilter(tk, minGram, maxGram);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
for (int start = 0; start < codePointCount; ++start) {
for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
for (int i = 0; i < 20; i++) {
final String s = TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = TestUtil.nextInt(random(), 1, 3);
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
TokenStream tk = new KeywordTokenizer();
((Tokenizer)tk).setReader(new StringReader(s));
tk = new NGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
if (codePointCount < minGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int startIndex = Character.offsetByCodePoints(s, 0, start);
final int endIndex = Character.offsetByCodePoints(s, 0, end);
assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
assertEquals(s, termAtt.toString());
}
for (int start = 0; start < codePointCount; ++start) {
for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int startIndex = Character.offsetByCodePoints(s, 0, start);
final int endIndex = Character.offsetByCodePoints(s, 0, end);
assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
}
}
if (codePointCount > maxGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
assertEquals(s, termAtt.toString());
}
assertFalse(tk.incrementToken());
tk.close();
}
assertFalse(tk.incrementToken());
}
}

View File

@ -56,12 +56,14 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
}
/**
* Test the NGramFilterFactory
* Test the NGramFilterFactory with old defaults
*/
public void testNGramFilter() throws Exception {
Reader reader = new StringReader("test");
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("NGram").create(stream);
stream = tokenFilterFactory("NGram",
"minGramSize", "1",
"maxGramSize", "2").create(stream);
assertTokenStreamContents(stream,
new String[] { "t", "te", "e", "es", "s", "st", "t" });
}
@ -126,12 +128,13 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
}
/**
* Test EdgeNGramFilterFactory
* Test EdgeNGramFilterFactory with old defaults
*/
public void testEdgeNGramFilter() throws Exception {
Reader reader = new StringReader("test");
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("EdgeNGram").create(stream);
stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1",
"maxGramSize", "1").create(stream);
assertTokenStreamContents(stream,
new String[] { "t" });
}
@ -173,7 +176,8 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
IllegalArgumentException expected = null;
expected = expectThrows(IllegalArgumentException.class, () -> {
tokenizerFactory("NGram", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
@ -184,12 +188,12 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
assertTrue(expected.getMessage().contains("Unknown parameters"));
expected = expectThrows(IllegalArgumentException.class, () -> {
tokenFilterFactory("NGram", "bogusArg", "bogusValue");
tokenFilterFactory("NGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
expected = expectThrows(IllegalArgumentException.class, () -> {
tokenFilterFactory("EdgeNGram", "bogusArg", "bogusValue");
tokenFilterFactory("EdgeNGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
}

View File

@ -87,7 +87,7 @@ public class BM25NBClassifierTest extends ClassificationTestBase<BytesRef> {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
}
}

View File

@ -86,7 +86,7 @@ public class CachingNaiveBayesClassifierTest extends ClassificationTestBase<Byte
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
}
}

View File

@ -89,7 +89,7 @@ public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase<Bytes
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
}
}

View File

@ -359,7 +359,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
: "no need \"textgrams\" when minPrefixChars="+minPrefixChars;
if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) {
// TODO: should use an EdgeNGramTokenFilterFactory here
TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars);
TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false);
return new TokenStreamComponents(components.getTokenizer(), filter);
} else {
return components;