From a03e38d5d05008aaef969a200071c03a1d6cb991 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 26 Apr 2013 11:03:26 +0000 Subject: [PATCH] LUCENE-4955: Fix NGramTokenizer and NGramTokenFilter, and remove them from TestRandomChains' exclusion list. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1476135 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 13 ++ .../ngram/Lucene43NGramTokenizer.java | 155 ++++++++++++++++++ .../analysis/ngram/NGramFilterFactory.java | 2 +- .../analysis/ngram/NGramTokenFilter.java | 103 +++++++++--- .../lucene/analysis/ngram/NGramTokenizer.java | 151 +++++++++-------- .../analysis/ngram/NGramTokenizerFactory.java | 10 +- .../analysis/core/TestRandomChains.java | 15 +- .../analysis/ngram/NGramTokenFilterTest.java | 76 ++++++--- .../analysis/ngram/NGramTokenizerTest.java | 69 ++++++-- .../analysis/ngram/TestNGramFilters.java | 8 +- 10 files changed, 466 insertions(+), 136 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 39c875604e2..ee34d6edb90 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -37,6 +37,16 @@ Optimizations ======================= Lucene 4.4.0 ======================= +Changes in backwards compatibility policy + +* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the + same position and preserves the position length and the offsets of the + original token. (Simon Willnauer, Adrien Grand) + +* LUCENE-4955: NGramTokenizer now emits n-grams in a different order + (a, ab, b, bc, c) instead of (a, b, c, ab, bc) and doesn't trim trailing + whitespaces. (Adrien Grand) + Bug Fixes * LUCENE-4935: CustomScoreQuery wrongly applied its query boost twice @@ -46,6 +56,9 @@ Bug Fixes if you had a 64-bit JVM without compressed OOPS: IBM J9, or Oracle with large heap/explicitly disabled. (Mike McCandless, Uwe Schindler, Robert Muir) +* LUCENE-4955: NGramTokenizer now supports inputs larger than 1024 chars. + (Adrien Grand) + Optimizations * LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java new file mode 100644 index 00000000000..25693da23ba --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java @@ -0,0 +1,155 @@ +package org.apache.lucene.analysis.ngram; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * Old broken version of {@link NGramTokenizer}. + */ +@Deprecated +public final class Lucene43NGramTokenizer extends Tokenizer { + public static final int DEFAULT_MIN_NGRAM_SIZE = 1; + public static final int DEFAULT_MAX_NGRAM_SIZE = 2; + + private int minGram, maxGram; + private int gramSize; + private int pos; + private int inLen; // length of the input AFTER trim() + private int charsRead; // length of the input + private String inStr; + private boolean started; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + /** + * Creates NGramTokenizer with given min and max n-grams. + * @param input {@link Reader} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public Lucene43NGramTokenizer(Reader input, int minGram, int maxGram) { + super(input); + init(minGram, maxGram); + } + + /** + * Creates NGramTokenizer with given min and max n-grams. + * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use + * @param input {@link Reader} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public Lucene43NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) { + super(factory, input); + init(minGram, maxGram); + } + + /** + * Creates NGramTokenizer with default min and max n-grams. + * @param input {@link Reader} holding the input to be tokenized + */ + public Lucene43NGramTokenizer(Reader input) { + this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); + } + + private void init(int minGram, int maxGram) { + if (minGram < 1) { + throw new IllegalArgumentException("minGram must be greater than zero"); + } + if (minGram > maxGram) { + throw new IllegalArgumentException("minGram must not be greater than maxGram"); + } + this.minGram = minGram; + this.maxGram = maxGram; + } + + /** Returns the next token in the stream, or null at EOS. 
*/ + @Override + public boolean incrementToken() throws IOException { + clearAttributes(); + if (!started) { + started = true; + gramSize = minGram; + char[] chars = new char[1024]; + charsRead = 0; + // TODO: refactor to a shared readFully somewhere: + while (charsRead < chars.length) { + int inc = input.read(chars, charsRead, chars.length-charsRead); + if (inc == -1) { + break; + } + charsRead += inc; + } + inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings + + if (charsRead == chars.length) { + // Read extra throwaway chars so that on end() we + // report the correct offset: + char[] throwaway = new char[1024]; + while(true) { + final int inc = input.read(throwaway, 0, throwaway.length); + if (inc == -1) { + break; + } + charsRead += inc; + } + } + + inLen = inStr.length(); + if (inLen == 0) { + return false; + } + } + + if (pos+gramSize > inLen) { // if we hit the end of the string + pos = 0; // reset to beginning of string + gramSize++; // increase n-gram size + if (gramSize > maxGram) // we are done + return false; + if (pos+gramSize > inLen) + return false; + } + + int oldPos = pos; + pos++; + termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize); + offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize)); + return true; + } + + @Override + public void end() { + // set final offset + final int finalOffset = correctOffset(charsRead); + this.offsetAtt.setOffset(finalOffset, finalOffset); + } + + @Override + public void reset() throws IOException { + super.reset(); + started = false; + pos = 0; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java index 42b3934cb62..60398bdf4b2 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java @@ -47,6 +47,6 @@ public class NGramFilterFactory extends TokenFilterFactory { @Override public NGramTokenFilter create(TokenStream input) { - return new NGramTokenFilter(input, minGramSize, maxGramSize); + return new NGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java index ebb6e1290a5..30c2454852f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java @@ -21,37 +21,60 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.Version; /** * Tokenizes the input into n-grams of the given size(s). + * + *

+ * <p>You must specify the required {@link Version} compatibility when
+ * creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:
+ * <ul>
+ * <li>emits all n-grams for the same token at the same position,</li>
+ * <li>preserves the position length and the offsets of the original
+ * token.</li>
+ * </ul>
+ * <p>
You can make this filter use the old behavior by providing a version < + * {@link Version#LUCENE_44} in the constructor but this is not recommended as + * it will lead to broken {@link TokenStream}s that will cause highlighting + * bugs. */ public final class NGramTokenFilter extends TokenFilter { public static final int DEFAULT_MIN_NGRAM_SIZE = 1; public static final int DEFAULT_MAX_NGRAM_SIZE = 2; - private int minGram, maxGram; - + private final int minGram, maxGram; + private char[] curTermBuffer; private int curTermLength; private int curGramSize; private int curPos; + private int curPosInc, curPosLen; private int tokStart; - private int tokEnd; // only used if the length changed before this filter + private int tokEnd; private boolean hasIllegalOffsets; // only if the length changed before this filter - + + private final Version version; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAtt; + private final PositionLengthAttribute posLenAtt; private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); /** * Creates NGramTokenFilter with given min and max n-grams. + * @param version Lucene version to enable correct position increments. + * See above for details. * @param input {@link TokenStream} holding the input to be tokenized * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ - public NGramTokenFilter(TokenStream input, int minGram, int maxGram) { - super(input); + public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) { + super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE)); + this.version = version; if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } @@ -60,14 +83,37 @@ public final class NGramTokenFilter extends TokenFilter { } this.minGram = minGram; this.maxGram = maxGram; + if (version.onOrAfter(Version.LUCENE_44)) { + posIncAtt = addAttribute(PositionIncrementAttribute.class); + posLenAtt = addAttribute(PositionLengthAttribute.class); + } else { + posIncAtt = new PositionIncrementAttribute() { + @Override + public void setPositionIncrement(int positionIncrement) {} + @Override + public int getPositionIncrement() { + return 0; + } + }; + posLenAtt = new PositionLengthAttribute() { + @Override + public void setPositionLength(int positionLength) {} + @Override + public int getPositionLength() { + return 0; + } + }; + } } /** * Creates NGramTokenFilter with default min and max n-grams. + * @param version Lucene version to enable correct position increments. + * See above for details. * @param input {@link TokenStream} holding the input to be tokenized */ - public NGramTokenFilter(TokenStream input) { - this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); + public NGramTokenFilter(Version version, TokenStream input) { + this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); } /** Returns the next token in the stream, or null at EOS. 
*/ @@ -82,6 +128,8 @@ public final class NGramTokenFilter extends TokenFilter { curTermLength = termAtt.length(); curGramSize = minGram; curPos = 0; + curPosInc = posIncAtt.getPositionIncrement(); + curPosLen = posLenAtt.getPositionLength(); tokStart = offsetAtt.startOffset(); tokEnd = offsetAtt.endOffset(); // if length by start + end offsets doesn't match the term text then assume @@ -89,20 +137,37 @@ public final class NGramTokenFilter extends TokenFilter { hasIllegalOffsets = (tokStart + curTermLength) != tokEnd; } } - while (curGramSize <= maxGram) { - while (curPos+curGramSize <= curTermLength) { // while there is input + if (version.onOrAfter(Version.LUCENE_44)) { + if (curGramSize > maxGram || curPos + curGramSize > curTermLength) { + ++curPos; + curGramSize = minGram; + } + if (curPos + curGramSize <= curTermLength) { clearAttributes(); termAtt.copyBuffer(curTermBuffer, curPos, curGramSize); - if (hasIllegalOffsets) { - offsetAtt.setOffset(tokStart, tokEnd); - } else { - offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize); - } - curPos++; + posIncAtt.setPositionIncrement(curPosInc); + curPosInc = 0; + posLenAtt.setPositionLength(curPosLen); + offsetAtt.setOffset(tokStart, tokEnd); + curGramSize++; return true; } - curGramSize++; // increase n-gram size - curPos = 0; + } else { + while (curGramSize <= maxGram) { + while (curPos+curGramSize <= curTermLength) { // while there is input + clearAttributes(); + termAtt.copyBuffer(curTermBuffer, curPos, curGramSize); + if (hasIllegalOffsets) { + offsetAtt.setOffset(tokStart, tokEnd); + } else { + offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize); + } + curPos++; + return true; + } + curGramSize++; // increase n-gram size + curPos = 0; + } } curTermBuffer = null; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java index 81ce887ea7b..a0665bf6f5f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java @@ -17,64 +17,90 @@ package org.apache.lucene.analysis.ngram; * limitations under the License. */ -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.util.AttributeSource; - import java.io.IOException; import java.io.Reader; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.Version; + /** * Tokenizes the input into n-grams of the given size(s). + *

+ * <p>Unlike {@link NGramTokenFilter}, this class sets offsets so that the
+ * characters between startOffset and endOffset in the original stream are the
+ * same as the term chars.
+ * <p>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
+ * <table>
+ * <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
+ * <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
+ * <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
+ * <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
+ * </table>
+ *

+ * <p>Before Lucene 4.4, this class had a different behavior:
+ * <ul>
+ * <li>it could not handle more than 1024 chars of input,</li>
+ * <li>it trimmed trailing whitespaces,</li>
+ * <li>it emitted grams in a different order ((a, b, c, ab, bc) instead of
+ * (a, ab, b, bc, c)).</li>
+ * </ul>
+ * <p>
Although highly discouraged, it is still possible to use the old behavior + * through {@link Lucene43NGramTokenizer}. */ public final class NGramTokenizer extends Tokenizer { public static final int DEFAULT_MIN_NGRAM_SIZE = 1; public static final int DEFAULT_MAX_NGRAM_SIZE = 2; - private int minGram, maxGram; + private char[] buffer; + private int bufferStart, bufferEnd; // remaining slice of the buffer + private int offset; private int gramSize; - private int pos; - private int inLen; // length of the input AFTER trim() - private int charsRead; // length of the input - private String inStr; - private boolean started; - + private int minGram, maxGram; + private boolean exhausted; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); /** * Creates NGramTokenizer with given min and max n-grams. + * @param version the lucene compatibility version * @param input {@link Reader} holding the input to be tokenized * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ - public NGramTokenizer(Reader input, int minGram, int maxGram) { + public NGramTokenizer(Version version, Reader input, int minGram, int maxGram) { super(input); - init(minGram, maxGram); + init(version, minGram, maxGram); } /** * Creates NGramTokenizer with given min and max n-grams. + * @param version the lucene compatibility version * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use * @param input {@link Reader} holding the input to be tokenized * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ - public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) { + public NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) { super(factory, input); - init(minGram, maxGram); + init(version, minGram, maxGram); } /** * Creates NGramTokenizer with default min and max n-grams. + * @param version the lucene compatibility version * @param input {@link Reader} holding the input to be tokenized */ - public NGramTokenizer(Reader input) { - this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); + public NGramTokenizer(Version version, Reader input) { + this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); } - - private void init(int minGram, int maxGram) { + + private void init(Version version, int minGram, int maxGram) { + if (!version.onOrAfter(Version.LUCENE_44)) { + throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer"); + } if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } @@ -83,73 +109,66 @@ public final class NGramTokenizer extends Tokenizer { } this.minGram = minGram; this.maxGram = maxGram; + buffer = new char[maxGram + 1024]; } /** Returns the next token in the stream, or null at EOS. 
*/ @Override public boolean incrementToken() throws IOException { clearAttributes(); - if (!started) { - started = true; - gramSize = minGram; - char[] chars = new char[1024]; - charsRead = 0; - // TODO: refactor to a shared readFully somewhere: - while (charsRead < chars.length) { - int inc = input.read(chars, charsRead, chars.length-charsRead); - if (inc == -1) { - break; - } - charsRead += inc; - } - inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings - if (charsRead == chars.length) { - // Read extra throwaway chars so that on end() we - // report the correct offset: - char[] throwaway = new char[1024]; - while(true) { - final int inc = input.read(throwaway, 0, throwaway.length); - if (inc == -1) { + // compact + if (bufferStart >= buffer.length - maxGram) { + System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart); + bufferEnd -= bufferStart; + bufferStart = 0; + + // fill in remaining space + if (!exhausted) { + // TODO: refactor to a shared readFully + while (bufferEnd < buffer.length) { + final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd); + if (read == -1) { + exhausted = true; break; } - charsRead += inc; + bufferEnd += read; } } - - inLen = inStr.length(); - if (inLen == 0) { - return false; - } } - if (pos+gramSize > inLen) { // if we hit the end of the string - pos = 0; // reset to beginning of string - gramSize++; // increase n-gram size - if (gramSize > maxGram) // we are done - return false; - if (pos+gramSize > inLen) - return false; + // should we go to the next offset? + if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) { + bufferStart++; + offset++; + gramSize = minGram; } - int oldPos = pos; - pos++; - termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize); - offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize)); + // are there enough chars remaining? 
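+    // (i.e. can a minGram-sized gram still start at bufferStart? if not, the
+    // buffer was already refilled as far as possible, so the input is
+    // exhausted and we are done)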
+ if (bufferStart + gramSize > bufferEnd) { + return false; + } + + termAtt.copyBuffer(buffer, bufferStart, gramSize); + posIncAtt.setPositionIncrement(1); + posLenAtt.setPositionLength(1); + offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize)); + ++gramSize; return true; } - + @Override public void end() { - // set final offset - final int finalOffset = correctOffset(charsRead); - this.offsetAtt.setOffset(finalOffset, finalOffset); - } - + final int endOffset = correctOffset(offset + bufferEnd - bufferStart); + offsetAtt.setOffset(endOffset, endOffset); + } + @Override public void reset() throws IOException { super.reset(); - started = false; - pos = 0; + bufferStart = bufferEnd = buffer.length; + offset = 0; + gramSize = minGram; + exhausted = false; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java index 1b702909a77..c74bb42f5f0 100755 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java @@ -18,8 +18,10 @@ package org.apache.lucene.analysis.ngram; */ import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.util.TokenizerFactory; import org.apache.lucene.util.AttributeSource.AttributeFactory; +import org.apache.lucene.util.Version; import java.io.Reader; import java.util.Map; @@ -49,7 +51,11 @@ public class NGramTokenizerFactory extends TokenizerFactory { /** Creates the {@link TokenStream} of n-grams from the given {@link Reader} and {@link AttributeFactory}. */ @Override - public NGramTokenizer create(AttributeFactory factory, Reader input) { - return new NGramTokenizer(factory, input, minGramSize, maxGramSize); + public Tokenizer create(AttributeFactory factory, Reader input) { + if (luceneMatchVersion.onOrAfter(Version.LUCENE_44)) { + return new NGramTokenizer(luceneMatchVersion, factory, input, minGramSize, maxGramSize); + } else { + return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize); + } } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index a34d43d8692..b788c069ce0 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -54,8 +54,6 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter; -import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; import org.apache.lucene.analysis.ValidatingTokenFilter; import org.apache.lucene.analysis.charfilter.NormalizeCharMap; import org.apache.lucene.analysis.cjk.CJKBigramFilter; @@ -71,14 +69,14 @@ import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter; import org.apache.lucene.analysis.miscellaneous.KeepWordFilter; import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; +import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter; import 
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; +import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; -import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; -import org.apache.lucene.analysis.ngram.NGramTokenFilter; -import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.analysis.ngram.Lucene43NGramTokenizer; import org.apache.lucene.analysis.path.PathHierarchyTokenizer; import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; import org.apache.lucene.analysis.payloads.IdentityEncoder; @@ -90,8 +88,9 @@ import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.th.ThaiWordFilter; import org.apache.lucene.analysis.util.CharArrayMap; import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.util.AttributeSource.AttributeFactory; +import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.AttributeSource.AttributeFactory; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.Rethrow; import org.apache.lucene.util.Version; @@ -162,9 +161,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // startOffset thats > its endOffset // (see LUCENE-3738 for a list of other offenders here) // broken! - NGramTokenizer.class, - // broken! - NGramTokenFilter.class, + Lucene43NGramTokenizer.class, // broken! EdgeNGramTokenizer.class, // broken! 
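For illustration only (not part of this patch): a minimal sketch of how the new
4.4 constructors and token order behave, based on the CHANGES.txt entries above.
The demo class NGramOrderDemo is hypothetical; the Lucene calls are the
constructors introduced in this patch plus the standard TokenStream consumer
workflow (reset/incrementToken/end/close).

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class NGramOrderDemo {

  private static void dump(TokenStream ts) throws Exception {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term + " (posInc=" + posInc.getPositionIncrement() + ")");
    }
    ts.end();
    ts.close();
  }

  public static void main(String[] args) throws Exception {
    // NGramTokenizer (4.4): grams are emitted by increasing start offset
    // first, then by increasing length: a, ab, b, bc, c -- no longer
    // a, b, c, ab, bc.
    dump(new NGramTokenizer(Version.LUCENE_44, new StringReader("abc"), 1, 2));

    // NGramTokenFilter (4.4): all grams of a token share its position (the
    // first gram has posInc=1, the rest posInc=0) and keep the original
    // token's offsets.
    dump(new NGramTokenFilter(Version.LUCENE_44,
        new WhitespaceTokenizer(Version.LUCENE_44, new StringReader("abc")), 1, 2));
  }
}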
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java index b2118472206..37db05d849e 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java @@ -26,7 +26,9 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.util.Version; +import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.Random; @@ -46,7 +48,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { boolean gotException = false; try { - new NGramTokenFilter(input, 2, 1); + new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 1); } catch (IllegalArgumentException e) { gotException = true; } @@ -56,50 +58,64 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput2() throws Exception { boolean gotException = false; try { - new NGramTokenFilter(input, 0, 1); + new NGramTokenFilter(TEST_VERSION_CURRENT, input, 0, 1); } catch (IllegalArgumentException e) { gotException = true; } assertTrue(gotException); } - + public void testUnigrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1); - assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 1); + assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); } public void testBigrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2); - assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 2); + assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0}); } public void testNgrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3); assertTokenStreamContents(filter, - new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, - new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2}, - new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}, - null, null, null, null, false + new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"}, + new int[]{0,0,0,0,0,0,0,0,0,0,0,0}, + new int[]{5,5,5,5,5,5,5,5,5,5,5,5}, + null, + new int[]{1,0,0,0,0,0,0,0,0,0,0,0}, + null, null, false ); } - + + public void testNgramsNoIncrement() throws Exception { + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3); + assertTokenStreamContents(filter, + new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"}, + new int[]{0,0,0,0,0,0,0,0,0,0,0,0}, + new int[]{5,5,5,5,5,5,5,5,5,5,5,5}, + null, + new int[]{1,0,0,0,0,0,0,0,0,0,0,0}, + null, null, false + ); + } + public void testOversizedNgrams() throws Exception { - NGramTokenFilter filter = new 
NGramTokenFilter(input, 6, 7); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 6, 7); assertTokenStreamContents(filter, new String[0], new int[0], new int[0]); } public void testSmallTokenInStream() throws Exception { input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false); - NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3); - assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 3, 3); + assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2}); } public void testReset() throws Exception { WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde")); - NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1); - assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 1, 1); + assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); tokenizer.setReader(new StringReader("abcde")); - assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}); + assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); } // LUCENE-3642 @@ -112,14 +128,15 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); TokenFilter filters = new ASCIIFoldingFilter(tokenizer); - filters = new NGramTokenFilter(filters, 2, 2); + filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2); return new TokenStreamComponents(tokenizer, filters); } }; assertAnalyzesTo(analyzer, "mosfellsbær", new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, - new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }); + new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }, + new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }); } /** blast some random strings through the analyzer */ @@ -129,7 +146,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, - new NGramTokenFilter(tokenizer, 2, 4)); + new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4)); } }; checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false); @@ -142,9 +159,22 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new KeywordTokenizer(reader); return new TokenStreamComponents(tokenizer, - new NGramTokenFilter(tokenizer, 2, 15)); + new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 15)); } }; checkAnalysisConsistency(random, a, random.nextBoolean(), ""); } + + public void testLucene43() throws IOException { + NGramTokenFilter filter = new NGramTokenFilter(Version.LUCENE_43, input, 2, 3); + 
assertTokenStreamContents(filter, + new String[]{"ab","bc","cd","de","abc","bcd","cde"}, + new int[]{0,1,2,3,0,1,2}, + new int[]{2,3,4,5,3,4,5}, + null, + new int[]{1,1,1,1,1,1,1}, + null, null, false + ); + } + } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java index d4ac43f2814..f56f41309a2 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java @@ -18,13 +18,21 @@ package org.apache.lucene.analysis.ngram; */ +import java.io.IOException; import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.util.LuceneTestCase.Slow; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util._TestUtil; + +import com.carrotsearch.randomizedtesting.generators.RandomStrings; /** * Tests {@link NGramTokenizer} for correctness. @@ -41,7 +49,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { boolean gotException = false; try { - new NGramTokenizer(input, 2, 1); + new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 1); } catch (IllegalArgumentException e) { gotException = true; } @@ -51,7 +59,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { public void testInvalidInput2() throws Exception { boolean gotException = false; try { - new NGramTokenizer(input, 0, 1); + new NGramTokenizer(TEST_VERSION_CURRENT, input, 0, 1); } catch (IllegalArgumentException e) { gotException = true; } @@ -59,21 +67,21 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { } public void testUnigrams() throws Exception { - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1); + NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1); assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); } public void testBigrams() throws Exception { - NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2); + NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 2); assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */); } public void testNgrams() throws Exception { - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3); + NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3); assertTokenStreamContents(tokenizer, - new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, - new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2}, - new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}, + new String[]{"a","ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e"}, + new int[]{0,0,0,1,1,1,2,2,2,3,3,4}, + new int[]{1,2,3,2,3,4,3,4,5,4,5,5}, null, null, null, @@ -83,12 +91,12 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { } public void 
testOversizedNgrams() throws Exception { - NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7); + NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 6, 7); assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */); } public void testReset() throws Exception { - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1); + NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1); assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); tokenizer.setReader(new StringReader("abcde")); assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); @@ -99,11 +107,48 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new NGramTokenizer(reader, 2, 4); + Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4); return new TokenStreamComponents(tokenizer, tokenizer); } }; checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false); checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false); } + + private void testNGrams(int minGram, int maxGram, int length) throws IOException { + final String s = RandomStrings.randomAsciiOfLength(random(), length); + final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram); + final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class); + final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class); + final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class); + final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class); + grams.reset(); + for (int start = 0; start < s.length(); ++start) { + for (int end = start + minGram; end <= start + maxGram && end <= s.length(); ++end) { + assertTrue(grams.incrementToken()); + assertEquals(s.substring(start, end), termAtt.toString()); + assertEquals(1, posIncAtt.getPositionIncrement()); + assertEquals(start, offsetAtt.startOffset()); + assertEquals(end, offsetAtt.endOffset()); + } + } + grams.end(); + assertEquals(s.length(), offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + } + + public void testLargeInput() throws IOException { + // test sliding + final int minGram = _TestUtil.nextInt(random(), 1, 100); + final int maxGram = _TestUtil.nextInt(random(), minGram, 100); + testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024)); + } + + public void testLargeMaxGram() throws IOException { + // test sliding with maxGram > 1024 + final int minGram = _TestUtil.nextInt(random(), 1200, 1300); + final int maxGram = _TestUtil.nextInt(random(), minGram, 1300); + testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024)); + } + } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java index a4ea1a2d09c..2256af6ba92 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java @@ -35,7 +35,7 @@ public class TestNGramFilters 
extends BaseTokenStreamFactoryTestCase { Reader reader = new StringReader("test"); TokenStream stream = tokenizerFactory("NGram").create(reader); assertTokenStreamContents(stream, - new String[] { "t", "e", "s", "t", "te", "es", "st" }); + new String[] { "t", "te", "e", "es", "s", "st", "t" }); } /** @@ -47,7 +47,7 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { "minGramSize", "2", "maxGramSize", "3").create(reader); assertTokenStreamContents(stream, - new String[] { "te", "es", "st", "tes", "est" }); + new String[] { "te", "tes", "es", "est", "st" }); } /** @@ -58,7 +58,7 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); stream = tokenFilterFactory("NGram").create(stream); assertTokenStreamContents(stream, - new String[] { "t", "e", "s", "t", "te", "es", "st" }); + new String[] { "t", "te", "e", "es", "s", "st", "t" }); } /** @@ -71,7 +71,7 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { "minGramSize", "2", "maxGramSize", "3").create(stream); assertTokenStreamContents(stream, - new String[] { "te", "es", "st", "tes", "est" }); + new String[] { "te", "tes", "es", "est", "st" }); } /**