diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java index f5edd1edf5f..f4647249d08 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java @@ -27,21 +27,19 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"> * <analyzer> * <tokenizer class="solr.WhitespaceTokenizerFactory"/> - * <filter class="solr.EdgeNGramFilterFactory" side="front" minGramSize="1" maxGramSize="1"/> + * <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/> * </analyzer> * </fieldType> */ public class EdgeNGramFilterFactory extends TokenFilterFactory { private final int maxGramSize; private final int minGramSize; - private final String side; /** Creates a new EdgeNGramFilterFactory */ public EdgeNGramFilterFactory(Map args) { super(args); minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE); maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); - side = get(args, "side", EdgeNGramTokenFilter.Side.FRONT.getLabel()); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -49,6 +47,6 @@ public class EdgeNGramFilterFactory extends TokenFilterFactory { @Override public EdgeNGramTokenFilter create(TokenStream input) { - return new EdgeNGramTokenFilter(luceneMatchVersion, input, side, minGramSize, maxGramSize); + return new EdgeNGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java 
b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java index d533c5bb19e..10aaf169ec7 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java @@ -17,69 +17,31 @@ package org.apache.lucene.analysis.ngram; * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.reverse.ReverseStringFilter; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.Version; -import java.io.IOException; - /** * Tokenizes the given token into n-grams of given size(s). *

- * This {@link TokenFilter} create n-grams from the beginning edge or ending edge of a input token. - *

As of Lucene 4.4, this filter does not support - * {@link Side#BACK} (you can use {@link ReverseStringFilter} up-front and - * afterward to get the same behavior) and does not update offsets anymore. + * This {@link TokenFilter} creates n-grams from the beginning edge of an input token. */ public final class EdgeNGramTokenFilter extends TokenFilter { - public static final Side DEFAULT_SIDE = Side.FRONT; public static final int DEFAULT_MAX_GRAM_SIZE = 1; public static final int DEFAULT_MIN_GRAM_SIZE = 1; - /** Specifies which side of the input the n-gram should be generated from */ - public static enum Side { - - /** Get the n-gram from the front of the input */ - FRONT { - @Override - public String getLabel() { return "front"; } - }, - - /** Get the n-gram from the end of the input */ - @Deprecated - BACK { - @Override - public String getLabel() { return "back"; } - }; - - public abstract String getLabel(); - - // Get the appropriate Side from a string - public static Side getSide(String sideName) { - if (FRONT.getLabel().equals(sideName)) { - return FRONT; - } - if (BACK.getLabel().equals(sideName)) { - return BACK; - } - return null; - } - } - - private final Version version; private final int minGram; private final int maxGram; - private Side side; private char[] curTermBuffer; private int curTermLength; private int curGramSize; private int tokStart; private int tokEnd; // only used if the length changed before this filter - private boolean updateOffsets; // never if the length changed before this filter private int savePosIncr; private boolean isFirstToken = true; @@ -90,28 +52,18 @@ public final class EdgeNGramTokenFilter extends TokenFilter { /** * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range * - * @param version the Lucene match version + * @param version the Lucene match version * @param input {@link TokenStream} holding the input to be tokenized - * @param side the {@link Side} from which to chop off an n-gram * 
@param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ - @Deprecated - public EdgeNGramTokenFilter(Version version, TokenStream input, Side side, int minGram, int maxGram) { + public EdgeNGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) { super(input); if (version == null) { throw new IllegalArgumentException("version must not be null"); } - if (version.onOrAfter(Version.LUCENE_44) && side == Side.BACK) { - throw new IllegalArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward"); - } - - if (side == null) { - throw new IllegalArgumentException("sideLabel must be either front or back"); - } - if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } @@ -120,36 +72,8 @@ public final class EdgeNGramTokenFilter extends TokenFilter { throw new IllegalArgumentException("minGram must not be greater than maxGram"); } - this.version = version; this.minGram = minGram; this.maxGram = maxGram; - this.side = side; - } - - /** - * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range - * - * @param version the Lucene match version - * @param input {@link TokenStream} holding the input to be tokenized - * @param sideLabel the name of the {@link Side} from which to chop off an n-gram - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate - */ - @Deprecated - public EdgeNGramTokenFilter(Version version, TokenStream input, String sideLabel, int minGram, int maxGram) { - this(version, input, Side.getSide(sideLabel), minGram, maxGram); - } - - /** - * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range - * - * @param version the Lucene match version - * @param input {@link TokenStream} holding the input to be tokenized - * @param minGram the smallest n-gram to generate - * @param maxGram the largest 
n-gram to generate - */ - public EdgeNGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) { - this(version, input, Side.FRONT, minGram, maxGram); } @Override @@ -164,28 +88,14 @@ public final class EdgeNGramTokenFilter extends TokenFilter { curGramSize = minGram; tokStart = offsetAtt.startOffset(); tokEnd = offsetAtt.endOffset(); - if (version.onOrAfter(Version.LUCENE_44)) { - // Never update offsets - updateOffsets = false; - } else { - // if length by start + end offsets doesn't match the term text then assume - // this is a synonym and don't adjust the offsets. - updateOffsets = (tokStart + curTermLength) == tokEnd; - } savePosIncr = posIncrAtt.getPositionIncrement(); } } if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit if (curGramSize <= curTermLength) { // if the remaining input is too short, we can't generate any n-grams // grab gramSize chars from front or back - int start = side == Side.FRONT ? 0 : curTermLength - curGramSize; - int end = start + curGramSize; clearAttributes(); - if (updateOffsets) { - offsetAtt.setOffset(tokStart + start, tokStart + end); - } else { - offsetAtt.setOffset(tokStart, tokEnd); - } + offsetAtt.setOffset(tokStart, tokEnd); // first ngram gets increment, others don't if (curGramSize == minGram) { // Leave the first token position increment at the cleared-attribute value of 1 @@ -195,7 +105,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter { } else { posIncrAtt.setPositionIncrement(0); } - termAtt.copyBuffer(curTermBuffer, start, curGramSize); + termAtt.copyBuffer(curTermBuffer, 0, curGramSize); curGramSize++; isFirstToken = false; return true; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java index 551decbf515..e41d940ebba 100644 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java @@ -30,160 +30,56 @@ import org.apache.lucene.util.Version; /** * Tokenizes the input from an edge into n-grams of given size(s). *

- * This {@link Tokenizer} create n-grams from the beginning edge or ending edge of a input token. - *

As of Lucene 4.4, this tokenizer

+ * This {@link Tokenizer} creates n-grams from the beginning edge of an input token. */ public final class EdgeNGramTokenizer extends Tokenizer { - public static final Side DEFAULT_SIDE = Side.FRONT; public static final int DEFAULT_MAX_GRAM_SIZE = 1; public static final int DEFAULT_MIN_GRAM_SIZE = 1; - private Version version; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - /** Specifies which side of the input the n-gram should be generated from */ - public static enum Side { - - /** Get the n-gram from the front of the input */ - FRONT { - @Override - public String getLabel() { return "front"; } - }, - - /** Get the n-gram from the end of the input */ - @Deprecated - BACK { - @Override - public String getLabel() { return "back"; } - }; - - public abstract String getLabel(); - - // Get the appropriate Side from a string - public static Side getSide(String sideName) { - if (FRONT.getLabel().equals(sideName)) { - return FRONT; - } - if (BACK.getLabel().equals(sideName)) { - return BACK; - } - return null; - } - } - private int minGram; private int maxGram; private int gramSize; - private Side side; private boolean started; private int inLen; // length of the input AFTER trim() private int charsRead; // length of the input private String inStr; - /** * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range * - * @param version the Lucene match version - * @param input {@link Reader} holding the input to be tokenized - * @param side the {@link Side} from which to chop off an n-gram - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate - */ - @Deprecated - public EdgeNGramTokenizer(Version version, Reader input, Side side, int minGram, int maxGram) { - super(input); - init(version, 
side, minGram, maxGram); - } - - /** - * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range - * - * @param version the Lucene match version - * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use - * @param input {@link Reader} holding the input to be tokenized - * @param side the {@link Side} from which to chop off an n-gram - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate - */ - @Deprecated - public EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) { - super(factory, input); - init(version, side, minGram, maxGram); - } - - /** - * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range - * - * @param version the Lucene match version - * @param input {@link Reader} holding the input to be tokenized - * @param sideLabel the name of the {@link Side} from which to chop off an n-gram - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate - */ - @Deprecated - public EdgeNGramTokenizer(Version version, Reader input, String sideLabel, int minGram, int maxGram) { - this(version, input, Side.getSide(sideLabel), minGram, maxGram); - } - - /** - * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range - * - * @param version the Lucene match version - * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use - * @param input {@link Reader} holding the input to be tokenized - * @param sideLabel the name of the {@link Side} from which to chop off an n-gram - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate - */ - @Deprecated - public EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, String sideLabel, int minGram, int maxGram) { - this(version, factory, input, Side.getSide(sideLabel), 
minGram, maxGram); - } - - /** - * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range - * - * @param version the Lucene match version + * @param version the Lucene match version * @param input {@link Reader} holding the input to be tokenized * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ - @Deprecated public EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) { - this(version, input, Side.FRONT, minGram, maxGram); + super(input); + init(version, minGram, maxGram); } /** * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range * - * @param version the Lucene match version + * @param version the Lucene match version * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use * @param input {@link Reader} holding the input to be tokenized * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ - @Deprecated public EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) { - this(version, factory, input, Side.FRONT, minGram, maxGram); + super(factory, input); + init(version, minGram, maxGram); } - private void init(Version version, Side side, int minGram, int maxGram) { + private void init(Version version, int minGram, int maxGram) { if (version == null) { throw new IllegalArgumentException("version must not be null"); } - if (side == null) { - throw new IllegalArgumentException("sideLabel must be either front or back"); - } - if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } @@ -192,18 +88,8 @@ public final class EdgeNGramTokenizer extends Tokenizer { throw new IllegalArgumentException("minGram must not be greater than maxGram"); } - if (version.onOrAfter(Version.LUCENE_44)) { - if (side == Side.BACK) { - throw new IllegalArgumentException("Side.BACK is not supported 
anymore as of Lucene 4.4"); - } - } else { - maxGram = Math.min(maxGram, 1024); - } - - this.version = version; this.minGram = minGram; this.maxGram = maxGram; - this.side = side; } /** Returns the next token in the stream, or null at EOS. */ @@ -214,27 +100,23 @@ public final class EdgeNGramTokenizer extends Tokenizer { if (!started) { started = true; gramSize = minGram; - final int limit = side == Side.FRONT ? maxGram : 1024; - char[] chars = new char[Math.min(1024, limit)]; + char[] chars = new char[Math.min(1024, maxGram)]; charsRead = 0; // TODO: refactor to a shared readFully somewhere: boolean exhausted = false; - while (charsRead < limit) { + while (charsRead < maxGram) { final int inc = input.read(chars, charsRead, chars.length-charsRead); if (inc == -1) { exhausted = true; break; } charsRead += inc; - if (charsRead == chars.length && charsRead < limit) { + if (charsRead == chars.length && charsRead < maxGram) { chars = ArrayUtil.grow(chars); } } inStr = new String(chars, 0, charsRead); - if (!version.onOrAfter(Version.LUCENE_44)) { - inStr = inStr.trim(); - } if (!exhausted) { // Read extra throwaway chars so that on end() we @@ -254,10 +136,8 @@ public final class EdgeNGramTokenizer extends Tokenizer { return false; } posIncrAtt.setPositionIncrement(1); - } else if (version.onOrAfter(Version.LUCENE_44)) { - posIncrAtt.setPositionIncrement(1); } else { - posIncrAtt.setPositionIncrement(0); + posIncrAtt.setPositionIncrement(1); } // if the remaining input is too short, we can't generate any n-grams @@ -271,10 +151,8 @@ public final class EdgeNGramTokenizer extends Tokenizer { } // grab gramSize chars from front or back - int start = side == Side.FRONT ? 
0 : inLen - gramSize; - int end = start + gramSize; - termAtt.setEmpty().append(inStr, start, end); - offsetAtt.setOffset(correctOffset(start), correctOffset(end)); + termAtt.setEmpty().append(inStr, 0, gramSize); + offsetAtt.setOffset(correctOffset(0), correctOffset(gramSize)); gramSize++; return true; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerFactory.java index a17b8a7dec1..9104262f185 100755 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerFactory.java @@ -28,21 +28,19 @@ import java.util.Map; *
  * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
  *   <analyzer>
- *     <tokenizer class="solr.EdgeNGramTokenizerFactory" side="front" minGramSize="1" maxGramSize="1"/>
+ *     <tokenizer class="solr.EdgeNGramTokenizerFactory" minGramSize="1" maxGramSize="1"/>
  *   </analyzer>
  * </fieldType>
*/ public class EdgeNGramTokenizerFactory extends TokenizerFactory { private final int maxGramSize; private final int minGramSize; - private final String side; /** Creates a new EdgeNGramTokenizerFactory */ public EdgeNGramTokenizerFactory(Map args) { super(args); minGramSize = getInt(args, "minGramSize", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE); maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); - side = get(args, "side", EdgeNGramTokenFilter.Side.FRONT.getLabel()); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -50,6 +48,6 @@ public class EdgeNGramTokenizerFactory extends TokenizerFactory { @Override public EdgeNGramTokenizer create(AttributeFactory factory, Reader input) { - return new EdgeNGramTokenizer(luceneMatchVersion, factory, input, side, minGramSize, maxGramSize); + return new EdgeNGramTokenizer(luceneMatchVersion, factory, input, minGramSize, maxGramSize); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 99bc562c3fa..4baefbc4428 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -431,20 +431,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } } }); - put(EdgeNGramTokenizer.Side.class, new ArgProducer() { - @Override public Object create(Random random) { - return random.nextBoolean() - ? EdgeNGramTokenizer.Side.FRONT - : EdgeNGramTokenizer.Side.BACK; - } - }); - put(EdgeNGramTokenFilter.Side.class, new ArgProducer() { - @Override public Object create(Random random) { - return random.nextBoolean() - ? 
EdgeNGramTokenFilter.Side.FRONT - : EdgeNGramTokenFilter.Side.BACK; - } - }); put(HyphenationTree.class, new ArgProducer() { @Override public Object create(Random random) { // TODO: make nastier diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java index 6fcc8f1639f..61393235022 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java @@ -19,15 +19,12 @@ package org.apache.lucene.analysis.ngram; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.apache.lucene.analysis.position.PositionFilter; -import org.apache.lucene.util.Version; import java.io.Reader; import java.io.StringReader; @@ -48,7 +45,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { boolean gotException = false; try { - new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 0, 0); + new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, 0, 0); } catch (IllegalArgumentException e) { gotException = true; } @@ -58,7 +55,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput2() throws Exception { boolean gotException = false; try { - new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 2, 
1); + new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 1); } catch (IllegalArgumentException e) { gotException = true; } @@ -68,7 +65,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput3() throws Exception { boolean gotException = false; try { - new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, -1, 2); + new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, -1, 2); } catch (IllegalArgumentException e) { gotException = true; } @@ -76,41 +73,23 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testFrontUnigram() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 1); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 1); assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{5}); } - public void testBackUnigram() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(Version.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 1); - assertTokenStreamContents(tokenizer, new String[]{"e"}, new int[]{4}, new int[]{5}); - } - public void testOversizedNgrams() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 6, 6); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, 6, 6); assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]); } public void testFrontRangeOfNgrams() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3); assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); } - 
public void testBackRangeOfNgrams() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(Version.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 3); - assertTokenStreamContents(tokenizer, - new String[]{"e","de","cde"}, - new int[]{4,3,2}, - new int[]{5,5,5}, - null, - null, - null, - null, - false); - } - public void testFilterPositions() throws Exception { TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false); - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, 1, 3); assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc","v","vw","vwx"}, new int[]{0,0,0,6,6,6}, @@ -125,7 +104,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testFirstTokenPositionIncrement() throws Exception { TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false); ts = new PositionFilter(ts, 0); // All but first token will get 0 position increment - EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, 2, 3); // The first token "a" will not be output, since it's smaller than the mingram size of 2. // The second token on input to EdgeNGramTokenFilter will have position increment of 0, // which should be increased to 1, since this is the first output token in the stream. 
@@ -139,38 +118,18 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testSmallTokenInStream() throws Exception { input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false); - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 3, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, 3, 3); assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}); } public void testReset() throws Exception { WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde")); - EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 1, 3); assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); tokenizer.setReader(new StringReader("abcde")); assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); } - // LUCENE-3642 - // EdgeNgram blindly adds term length to offset, but this can take things out of bounds - // wrt original text if a previous filter increases the length of the word (in this case æ -> ae) - // so in this case we behave like WDF, and preserve any modified offsets - public void testInvalidOffsets() throws Exception { - Analyzer analyzer = new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - TokenFilter filters = new ASCIIFoldingFilter(tokenizer); - filters = new EdgeNGramTokenFilter(Version.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15); - return new TokenStreamComponents(tokenizer, filters); - } - }; - 
assertAnalyzesTo(analyzer, "mosfellsbær", - new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" }, - new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, - new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }); - } - /** blast some random strings through the analyzer */ public void testRandomStrings() throws Exception { Analyzer a = new Analyzer() { @@ -178,20 +137,10 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, - new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 4)); + new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4)); } }; checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); - - Analyzer b = new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents(tokenizer, - new EdgeNGramTokenFilter(Version.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 4)); - } - }; - checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER, 20, false, false); } public void testEmptyTerm() throws Exception { @@ -201,19 +150,9 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new KeywordTokenizer(reader); return new TokenStreamComponents(tokenizer, - new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15)); + new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 15)); } }; checkAnalysisConsistency(random, a, random.nextBoolean(), ""); - - Analyzer b = new Analyzer() 
{ - @Override - protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new KeywordTokenizer(reader); - return new TokenStreamComponents(tokenizer, - new EdgeNGramTokenFilter(Version.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15)); - } - }; - checkAnalysisConsistency(random, b, random.nextBoolean(), ""); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java index 96c1e59b5c3..4db7efe5537 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java @@ -28,7 +28,6 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.util.Version; import org.apache.lucene.util._TestUtil; /** @@ -46,7 +45,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { boolean gotException = false; try { - new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, EdgeNGramTokenizer.Side.FRONT, 0, 0); + new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 0, 0); } catch (IllegalArgumentException e) { gotException = true; } @@ -56,7 +55,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase { public void testInvalidInput2() throws Exception { boolean gotException = false; try { - new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, EdgeNGramTokenizer.Side.FRONT, 2, 1); + new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 2, 1); } catch (IllegalArgumentException e) { gotException = true; } @@ -66,7 +65,7 @@ public class 
EdgeNGramTokenizerTest extends BaseTokenStreamTestCase { public void testInvalidInput3() throws Exception { boolean gotException = false; try { - new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, EdgeNGramTokenizer.Side.FRONT, -1, 2); + new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, -1, 2); } catch (IllegalArgumentException e) { gotException = true; } @@ -74,32 +73,22 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase { } public void testFrontUnigram() throws Exception { - EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, EdgeNGramTokenizer.Side.FRONT, 1, 1); + EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1); assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{1}, 5 /* abcde */); } - public void testBackUnigram() throws Exception { - EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(Version.LUCENE_43, input, EdgeNGramTokenizer.Side.BACK, 1, 1); - assertTokenStreamContents(tokenizer, new String[]{"e"}, new int[]{4}, new int[]{5}, 5 /* abcde */); - } - public void testOversizedNgrams() throws Exception { - EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, EdgeNGramTokenizer.Side.FRONT, 6, 6); + EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 6, 6); assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */); } public void testFrontRangeOfNgrams() throws Exception { - EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, EdgeNGramTokenizer.Side.FRONT, 1, 3); + EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3); assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */); } - - public void testBackRangeOfNgrams() throws Exception { - EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(Version.LUCENE_43, input, 
EdgeNGramTokenizer.Side.BACK, 1, 3); - assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, null, null, null, 5 /* abcde */, false); - } public void testReset() throws Exception { - EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, EdgeNGramTokenizer.Side.FRONT, 1, 3); + EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3); assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */); tokenizer.setReader(new StringReader("abcde")); assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */); @@ -110,37 +99,16 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase { Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, EdgeNGramTokenizer.Side.FRONT, 2, 4); + Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4); return new TokenStreamComponents(tokenizer, tokenizer); } }; checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false); checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192, false, false); - - Analyzer b = new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new EdgeNGramTokenizer(Version.LUCENE_43, reader, EdgeNGramTokenizer.Side.BACK, 2, 4); - return new TokenStreamComponents(tokenizer, tokenizer); - } - }; - checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER, 20, false, false); - checkRandomData(random(), b, 100*RANDOM_MULTIPLIER, 8192, false, false); } public void testTokenizerPositions() throws Exception { - EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(Version.LUCENE_43, new StringReader("abcde"), 
EdgeNGramTokenizer.Side.FRONT, 1, 3); - assertTokenStreamContents(tokenizer, - new String[]{"a","ab","abc"}, - new int[]{0,0,0}, - new int[]{1,2,3}, - null, - new int[] {1,0,0}, - null, - null, - false); - - tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3); + EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"), 1, 3); assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, @@ -156,7 +124,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase { final String input = _TestUtil.randomSimpleString(random(), 1024 * 5); final int minGram = _TestUtil.nextInt(random(), 1, 1024); final int maxGram = _TestUtil.nextInt(random(), minGram, 5 * 1024); - EdgeNGramTokenizer tk = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, new StringReader(input), EdgeNGramTokenizer.Side.FRONT, minGram, maxGram); + EdgeNGramTokenizer tk = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, new StringReader(input), minGram, maxGram); final CharTermAttribute charTermAtt = tk.addAttribute(CharTermAttribute.class); final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); final PositionIncrementAttribute posIncAtt = tk.addAttribute(PositionIncrementAttribute.class); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java index 0b8435255a6..47829cbca7b 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java @@ -23,7 +23,6 @@ import java.io.StringReader; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; -import org.apache.lucene.util.Version; /** * 
Simple tests to ensure the NGram filter factories are working. @@ -97,17 +96,6 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { new String[] { "t", "te" }); } - /** - * Test EdgeNGramTokenizerFactory with side option - */ - public void testEdgeNGramTokenizer3() throws Exception { - Reader reader = new StringReader("ready"); - TokenStream stream = tokenizerFactory("EdgeNGram", Version.LUCENE_43, - "side", "back").create(reader); - assertTokenStreamContents(stream, - new String[] { "y" }); - } - /** * Test EdgeNGramFilterFactory */ @@ -131,18 +119,6 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { assertTokenStreamContents(stream, new String[] { "t", "te" }); } - - /** - * Test EdgeNGramFilterFactory with side option - */ - public void testEdgeNGramFilter3() throws Exception { - Reader reader = new StringReader("ready"); - TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - stream = tokenFilterFactory("EdgeNGram", Version.LUCENE_43, - "side", "back").create(stream); - assertTokenStreamContents(stream, - new String[] { "y" }); - } /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception {