diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 9f280e60dc6..f878aa8680a 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -99,6 +99,10 @@ New features * LUCENE-2243: Add DisjunctionMaxQuery support for FastVectorHighlighter. (Koji Sekiguchi) + * LUCENE-2218: ShingleFilter supports minimum shingle size, and the separator + character is now configurable. It's also up to 20% faster. + (Steven Rowe via Robert Muir) + Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java index 2aa6cd4ea2a..8349eeb98c4 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java @@ -34,7 +34,9 @@ import org.apache.lucene.util.Version; public final class ShingleAnalyzerWrapper extends Analyzer { private final Analyzer defaultAnalyzer; - private int maxShingleSize = 2; + private int maxShingleSize = ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE; + private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE; + private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR; private boolean outputUnigrams = true; public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) { @@ -44,7 +46,13 @@ public final class ShingleAnalyzerWrapper extends Analyzer { public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) { this(defaultAnalyzer); - this.maxShingleSize = maxShingleSize; + setMaxShingleSize(maxShingleSize); + } + + public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) { + this(defaultAnalyzer); + setMaxShingleSize(maxShingleSize); + setMinShingleSize(minShingleSize); } /** @@ -58,29 +66,73 @@ public final class 
ShingleAnalyzerWrapper extends Analyzer { /** * Wraps {@link StandardAnalyzer}. */ - public ShingleAnalyzerWrapper(Version matchVersion, int nGramSize) { + public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) { this(matchVersion); - this.maxShingleSize = nGramSize; + setMaxShingleSize(maxShingleSize); + setMinShingleSize(minShingleSize); } /** - * The max shingle (ngram) size + * The max shingle (token ngram) size * - * @return The max shingle (ngram) size + * @return The max shingle (token ngram) size */ public int getMaxShingleSize() { return maxShingleSize; } /** - * Set the maximum size of output shingles - * + * Set the maximum size of output shingles (default: 2) + * * @param maxShingleSize max shingle size */ public void setMaxShingleSize(int maxShingleSize) { + if (maxShingleSize < 2) { + throw new IllegalArgumentException("Max shingle size must be >= 2"); + } this.maxShingleSize = maxShingleSize; } + /** + * The min shingle (token ngram) size + * + * @return The min shingle (token ngram) size + */ + public int getMinShingleSize() { + return minShingleSize; + } + + /** + *

Set the min shingle size (default: 2). + *

This method requires that the passed in minShingleSize is not greater + * than maxShingleSize, so make sure that maxShingleSize is set before + * calling this method. + * + * @param minShingleSize min size of output shingles + */ + public void setMinShingleSize(int minShingleSize) { + if (minShingleSize < 2) { + throw new IllegalArgumentException("Min shingle size must be >= 2"); + } + if (minShingleSize > maxShingleSize) { + throw new IllegalArgumentException + ("Min shingle size must be <= max shingle size"); + } + this.minShingleSize = minShingleSize; + } + + public String getTokenSeparator() { + return tokenSeparator; + } + + /** + * Sets the string to use when joining adjacent tokens to form a shingle + * @param tokenSeparator used to separate input stream tokens in output shingles + */ + public void setTokenSeparator(String tokenSeparator) { + this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator); + } + public boolean isOutputUnigrams() { return outputUnigrams; } @@ -104,8 +156,10 @@ public final class ShingleAnalyzerWrapper extends Analyzer { } catch (IOException e) { wrapped = defaultAnalyzer.tokenStream(fieldName, reader); } - ShingleFilter filter = new ShingleFilter(wrapped); + ShingleFilter filter = new ShingleFilter(wrapped, minShingleSize, maxShingleSize); + filter.setMinShingleSize(minShingleSize); filter.setMaxShingleSize(maxShingleSize); + filter.setTokenSeparator(tokenSeparator); filter.setOutputUnigrams(outputUnigrams); return filter; } @@ -113,7 +167,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer { private class SavedStreams { TokenStream wrapped; ShingleFilter shingle; - }; + } @Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { @@ -135,6 +189,8 @@ public final class ShingleAnalyzerWrapper extends Analyzer { } } streams.shingle.setMaxShingleSize(maxShingleSize); + streams.shingle.setMinShingleSize(minShingleSize); + 
streams.shingle.setTokenSeparator(tokenSeparator); streams.shingle.setOutputUnigrams(outputUnigrams); return streams.shingle; } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java index 2a74357695a..ebf789b4d4b 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java @@ -18,18 +18,15 @@ package org.apache.lucene.analysis.shingle; */ import java.io.IOException; -import java.util.Iterator; import java.util.LinkedList; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.AttributeSource.State; + /** *

A ShingleFilter constructs shingles (token n-grams) from a token stream. @@ -44,26 +41,59 @@ import org.apache.lucene.util.AttributeSource.State; */ public final class ShingleFilter extends TokenFilter { - private LinkedList shingleBuf = new LinkedList(); - private StringBuilder[] shingles; - private String tokenType = "shingle"; - /** * filler token for when positionIncrement is more than 1 */ public static final char[] FILLER_TOKEN = { '_' }; - /** * default maximum shingle size is 2. */ public static final int DEFAULT_MAX_SHINGLE_SIZE = 2; /** - * The string to use when joining adjacent tokens to form a shingle + * default minimum shingle size is 2. + */ + public static final int DEFAULT_MIN_SHINGLE_SIZE = 2; + + /** + * default token type attribute value is "shingle" + */ + public static final String DEFAULT_TOKEN_TYPE = "shingle"; + + /** + * The default string to use when joining adjacent tokens to form a shingle */ public static final String TOKEN_SEPARATOR = " "; + + /** + * The sequence of input stream tokens (or filler tokens, if necessary) + * that will be composed to form output shingles. + */ + private LinkedList inputWindow = new LinkedList(); + + /** + * The number of input tokens in the next output token. This is the "n" in + * "token n-grams". + */ + private CircularSequence gramSize; + + /** + * Shingle text is composed here. + */ + private StringBuilder shingleBuilder = new StringBuilder(); + + /** + * The token type attribute value to use - default is "shingle" + */ + private String tokenType = DEFAULT_TOKEN_TYPE; + + /** + * The string to use when joining adjacent tokens to form a shingle + */ + private String tokenSeparator = TOKEN_SEPARATOR; + /** * By default, we output unigrams (individual tokens) as well as shingles * (token n-grams). 
@@ -76,15 +106,40 @@ public final class ShingleFilter extends TokenFilter { private int maxShingleSize; /** - * Constructs a ShingleFilter with the specified single size from the + * minimum shingle size (number of tokens) + */ + private int minShingleSize; + + /** + * The remaining number of filler tokens inserted into the input stream + * from which shingles are composed, to handle position increments greater + * than one. + */ + private int numFillerTokensToInsert; + + /** + * The next input stream token. + */ + private State nextInputStreamToken; + + private final TermAttribute termAtt; + private final OffsetAttribute offsetAtt; + private final PositionIncrementAttribute posIncrAtt; + private final TypeAttribute typeAtt; + + + /** + * Constructs a ShingleFilter with the specified shingle size from the * {@link TokenStream} input * * @param input input stream + * @param minShingleSize minimum shingle size produced by the filter. * @param maxShingleSize maximum shingle size produced by the filter. */ - public ShingleFilter(TokenStream input, int maxShingleSize) { + public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) { super(input); setMaxShingleSize(maxShingleSize); + setMinShingleSize(minShingleSize); this.termAtt = addAttribute(TermAttribute.class); this.offsetAtt = addAttribute(OffsetAttribute.class); this.posIncrAtt = addAttribute(PositionIncrementAttribute.class); @@ -92,22 +147,34 @@ public final class ShingleFilter extends TokenFilter { } /** - * Construct a ShingleFilter with default shingle size. + * Constructs a ShingleFilter with the specified shingle size from the + * {@link TokenStream} input + * + * @param input input stream + * @param maxShingleSize maximum shingle size produced by the filter. + */ + public ShingleFilter(TokenStream input, int maxShingleSize) { + this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize); + } + + /** + * Construct a ShingleFilter with default shingle size: 2. 
* * @param input input stream */ public ShingleFilter(TokenStream input) { - this(input, DEFAULT_MAX_SHINGLE_SIZE); + this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE); } /** - * Construct a ShingleFilter with the specified token type for shingle tokens. + * Construct a ShingleFilter with the specified token type for shingle tokens + * and the default shingle size: 2 * * @param input input stream * @param tokenType token type for shingle tokens */ public ShingleFilter(TokenStream input, String tokenType) { - this(input, DEFAULT_MAX_SHINGLE_SIZE); + this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE); setTokenType(tokenType); } @@ -130,6 +197,7 @@ public final class ShingleFilter extends TokenFilter { */ public void setOutputUnigrams(boolean outputUnigrams) { this.outputUnigrams = outputUnigrams; + gramSize = new CircularSequence(); } /** @@ -141,203 +209,239 @@ public final class ShingleFilter extends TokenFilter { if (maxShingleSize < 2) { throw new IllegalArgumentException("Max shingle size must be >= 2"); } - shingles = new StringBuilder[maxShingleSize]; - for (int i = 0; i < shingles.length; i++) { - shingles[i] = new StringBuilder(); - } this.maxShingleSize = maxShingleSize; } /** - * Clear the StringBuilders that are used for storing the output shingles. + *

Set the min shingle size (default: 2). + *

This method requires that the passed in minShingleSize is not greater + * than maxShingleSize, so make sure that maxShingleSize is set before + * calling this method. + *

The unigram output option is independent of the min shingle size. + * + * @param minShingleSize min size of output shingles */ - private void clearShingles() { - for (int i = 0; i < shingles.length; i++) { - shingles[i].setLength(0); + public void setMinShingleSize(int minShingleSize) { + if (minShingleSize < 2) { + throw new IllegalArgumentException("Min shingle size must be >= 2"); } + if (minShingleSize > maxShingleSize) { + throw new IllegalArgumentException + ("Min shingle size must be <= max shingle size"); + } + this.minShingleSize = minShingleSize; + gramSize = new CircularSequence(); + } + + /** + * Sets the string to use when joining adjacent tokens to form a shingle + * @param tokenSeparator used to separate input stream tokens in output shingles + */ + public void setTokenSeparator(String tokenSeparator) { + this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator; } - - private AttributeSource.State nextToken; - private int shingleBufferPosition; - private int[] endOffsets; /* (non-Javadoc) * @see org.apache.lucene.analysis.TokenStream#next() */ @Override public final boolean incrementToken() throws IOException { - while (true) { - if (nextToken == null) { - if (!fillShingleBuffer()) { - return false; - } - } - - nextToken = shingleBuf.getFirst(); - - if (outputUnigrams) { - if (shingleBufferPosition == 0) { - restoreState(nextToken); - posIncrAtt.setPositionIncrement(1); - shingleBufferPosition++; - return true; - } - } else if (shingleBufferPosition % this.maxShingleSize == 0){ - shingleBufferPosition++; - } - - if (shingleBufferPosition < shingleBuf.size()) { - restoreState(nextToken); - typeAtt.setType(tokenType); - offsetAtt.setOffset(offsetAtt.startOffset(), endOffsets[shingleBufferPosition]); - StringBuilder buf = shingles[shingleBufferPosition]; - int termLength = buf.length(); - char[] termBuffer = termAtt.termBuffer(); - if (termBuffer.length < termLength) - termBuffer = termAtt.resizeTermBuffer(termLength); - buf.getChars(0, 
termLength, termBuffer, 0); - termAtt.setTermLength(termLength); - if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) { - posIncrAtt.setPositionIncrement(1); - } else { - posIncrAtt.setPositionIncrement(0); - } - shingleBufferPosition++; - if (shingleBufferPosition == shingleBuf.size()) { - nextToken = null; - shingleBufferPosition = 0; - } - return true; - } else { - nextToken = null; - shingleBufferPosition = 0; + boolean tokenAvailable = false; + if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) { + shiftInputWindow(); + } + if ( ! inputWindow.isEmpty()) { + restoreState(inputWindow.getFirst()); + if (1 == gramSize.getValue()) { + posIncrAtt.setPositionIncrement(1); + gramSize.advance(); + tokenAvailable = true; + } else if (inputWindow.size() >= gramSize.getValue()) { + getNextShingle(); + gramSize.advance(); + tokenAvailable = true; } } + return tokenAvailable; + } + + /** + *

Makes the next token a shingle of length {@link #gramSize}, + * composed of tokens taken from {@link #inputWindow}. + *

 Callers of this method must first ensure that there are at least + * gramSize tokens available in inputWindow. + */ + private void getNextShingle() { + int startOffset = offsetAtt.startOffset(); + + int minTokNum = gramSize.getValue() - 1; // zero-based inputWindow position + if (gramSize.getValue() == minShingleSize) { + // Clear the shingle text buffer if this is the first shingle + // at the current position in the input stream. + shingleBuilder.setLength(0); + minTokNum = 0; + } + for (int tokNum = minTokNum ; tokNum < gramSize.getValue() ; ++tokNum) { + if (tokNum > 0) { + shingleBuilder.append(tokenSeparator); + } + restoreState(inputWindow.get(tokNum)); + shingleBuilder.append(termAtt.termBuffer(), 0, termAtt.termLength()); + } + char[] termBuffer = termAtt.termBuffer(); + int termLength = shingleBuilder.length(); + if (termBuffer.length < termLength) { + termBuffer = termAtt.resizeTermBuffer(termLength); + } + shingleBuilder.getChars(0, termLength, termBuffer, 0); + termAtt.setTermLength(termLength); + posIncrAtt.setPositionIncrement(gramSize.atMinValue() ? 1 : 0); + typeAtt.setType(tokenType); + offsetAtt.setOffset(startOffset, offsetAtt.endOffset()); } - private int numFillerTokensToInsert; - private AttributeSource.State currentToken; - private boolean hasCurrentToken; - - private TermAttribute termAtt; - private OffsetAttribute offsetAtt; - private PositionIncrementAttribute posIncrAtt; - private TypeAttribute typeAtt; - /** - * Get the next token from the input stream and push it on the token buffer. - * If we encounter a token with position increment > 1, we put filler tokens - * on the token buffer. - *

- * Returns null when the end of the input stream is reached. - * @return the next token, or null if at end of input stream + *

Get the next token from the input stream. + *

If the next token has positionIncrement > 1, + * positionIncrement - 1 {@link #FILLER_TOKEN}s are + * inserted first. + * @return false for end of stream; true otherwise * @throws IOException if the input stream has a problem */ private boolean getNextToken() throws IOException { - - while (true) { - if (numFillerTokensToInsert > 0) { - if (currentToken == null) { - currentToken = captureState(); - } else { - restoreState(currentToken); - } - numFillerTokensToInsert--; - // A filler token occupies no space - offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset()); - termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); - return true; - } - - if (hasCurrentToken) { - if (currentToken != null) { - restoreState(currentToken); - currentToken = null; - } - hasCurrentToken = false; - return true; - } - - if (!input.incrementToken()) return false; - hasCurrentToken = true; - - if (posIncrAtt.getPositionIncrement() > 1) { - numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1; - } + boolean success = false; + if (numFillerTokensToInsert > 0) { + insertFillerToken(); + success = true; + } else if (null != nextInputStreamToken) { + restoreState(nextInputStreamToken); + nextInputStreamToken = null; + success = true; + } else if (input.incrementToken()) { + if (posIncrAtt.getPositionIncrement() > 1) { + numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1; + insertFillerToken(); + } + success = true; } + return success; } /** - * Fill the output buffer with new shingles. + * Inserts a {@link #FILLER_TOKEN} and decrements + * {@link #numFillerTokensToInsert}. 
+ */ + private void insertFillerToken() { + if (null == nextInputStreamToken) { + nextInputStreamToken = captureState(); + } else { + restoreState(nextInputStreamToken); + } + --numFillerTokensToInsert; + // A filler token occupies no space + offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset()); + termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); + } + + /** + *

Fills {@link #inputWindow} with input stream tokens, if available, + * shifting to the right if the window was previously full. + *

Resets {@link #gramSize} to its minimum value. * * @throws IOException if there's a problem getting the next token */ - private boolean fillShingleBuffer() throws IOException { - boolean addedToken = false; - /* - * Try to fill the shingle buffer. - */ - do { - if (getNextToken()) { - shingleBuf.add(captureState()); - if (shingleBuf.size() > maxShingleSize) - { - shingleBuf.removeFirst(); - } - addedToken = true; - } else { + private void shiftInputWindow() throws IOException { + if (inputWindow.size() > 0) { + inputWindow.removeFirst(); + } + while (getNextToken()) { + inputWindow.add(captureState()); + if (inputWindow.size() == maxShingleSize) { break; } - } while (shingleBuf.size() < maxShingleSize); - - if (shingleBuf.isEmpty()) { - return false; } - - /* - * If no new token could be added to the shingle buffer, we have reached - * the end of the input stream and have to discard the least recent token. - */ - if (! addedToken) { - shingleBuf.removeFirst(); - } - - if (shingleBuf.isEmpty()) { - return false; - } - - clearShingles(); - - endOffsets = new int[shingleBuf.size()]; - for (int i = 0; i < endOffsets.length; i++) { - endOffsets[i] = 0; - } - - int i = 0; - for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) { - restoreState(it.next()); - for (int j = i; j < shingles.length; j++) { - if (shingles[j].length() != 0) { - shingles[j].append(TOKEN_SEPARATOR); - } - shingles[j].append(termAtt.termBuffer(), 0, termAtt.termLength()); - } - - endOffsets[i] = offsetAtt.endOffset(); - i++; - } - - return true; + gramSize.reset(); } @Override public void reset() throws IOException { super.reset(); - nextToken = null; - shingleBufferPosition = 0; - shingleBuf.clear(); + gramSize.reset(); + inputWindow.clear(); numFillerTokensToInsert = 0; - currentToken = null; - hasCurrentToken = false; + } + + + /** + *

An instance of this class is used to maintain the number of input + * stream tokens that will be used to compose the next unigram or shingle: + * {@link #gramSize}. + *

gramSize will take on values from the circular sequence + * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }. + *

 1 is included in the circular sequence only if + * {@link #outputUnigrams} = true. + */ + private class CircularSequence { + private int value; + private int minValue; + + public CircularSequence() { + minValue = outputUnigrams ? 1 : minShingleSize; + reset(); + } + + /** + * {@link #advance()} + * @return the current value. + */ + public int getValue() { + return value; + } + + /** + *

Increments this circular number's value to the next member in the + * circular sequence + * gramSize will take on values from the circular sequence + * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }. + *

1 is included in the circular sequence only if + * {@link #outputUnigrams} = true. + * + * @return the next member in the circular sequence + */ + public int advance() { + if (value == 1) { + value = minShingleSize; + } else if (value == maxShingleSize) { + reset(); + } else { + ++value; + } + return value; + } + + /** + *

Sets this circular number's value to the first member of the + * circular sequence + *

gramSize will take on values from the circular sequence + * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }. + *

1 is included in the circular sequence only if + * {@link #outputUnigrams} = true. + */ + public void reset() { + value = minValue; + } + + /** + *

Returns true if the current value is the first member of the circular + * sequence. + *

If {@link #outputUnigrams} = true, the first member of the circular + * sequence will be 1; otherwise, it will be {@link #minShingleSize}. + * + * @return true if the current value is the first member of the circular + * sequence; false otherwise + */ + public boolean atMinValue() { + return value == minValue; + } } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java index 1991af0505d..9258aba003f 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java @@ -246,4 +246,117 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); } + + public void testNonDefaultMinShingleSize() throws Exception { + ShingleAnalyzerWrapper analyzer + = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 4); + assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles", + new String[] { "please", "please divide this", "please divide this sentence", + "divide", "divide this sentence", "divide this sentence into", + "this", "this sentence into", "this sentence into shingles", + "sentence", "sentence into shingles", + "into", + "shingles" }, + new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 }, + new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 }, + new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 }); + analyzer.setOutputUnigrams(false); + assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles", + new String[] { "please divide this", "please divide this sentence", + "divide this sentence", "divide this sentence into", + "this sentence into", "this sentence into shingles", + "sentence into shingles" }, + new 
int[] { 0, 0, 7, 7, 14, 14, 19 }, + new int[] { 18, 27, 27, 32, 32, 41, 41 }, + new int[] { 1, 0, 1, 0, 1, 0, 1 }); + } + + public void testNonDefaultMinAndSameMaxShingleSize() throws Exception { + ShingleAnalyzerWrapper analyzer + = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 3); + assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles", + new String[] { "please", "please divide this", + "divide", "divide this sentence", + "this", "this sentence into", + "sentence", "sentence into shingles", + "into", + "shingles" }, + new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 }, + new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 }, + new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 }); + analyzer.setOutputUnigrams(false); + assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles", + new String[] { "please divide this", + "divide this sentence", + "this sentence into", + "sentence into shingles" }, + new int[] { 0, 7, 14, 19 }, + new int[] { 18, 27, 32, 41 }, + new int[] { 1, 1, 1, 1 }); + } + + public void testNoTokenSeparator() throws Exception { + ShingleAnalyzerWrapper analyzer + = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer()); + analyzer.setTokenSeparator(""); + assertAnalyzesToReuse(analyzer, "please divide into shingles", + new String[] { "please", "pleasedivide", + "divide", "divideinto", + "into", "intoshingles", + "shingles" }, + new int[] { 0, 0, 7, 7, 14, 14, 19 }, + new int[] { 6, 13, 13, 18, 18, 27, 27 }, + new int[] { 1, 0, 1, 0, 1, 0, 1 }); + analyzer.setOutputUnigrams(false); + assertAnalyzesToReuse(analyzer, "please divide into shingles", + new String[] { "pleasedivide", + "divideinto", + "intoshingles" }, + new int[] { 0, 7, 14 }, + new int[] { 13, 18, 27 }, + new int[] { 1, 1, 1 }); + } + + public void testNullTokenSeparator() throws Exception { + ShingleAnalyzerWrapper analyzer + = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer()); + analyzer.setTokenSeparator(null); + assertAnalyzesToReuse(analyzer, 
"please divide into shingles", + new String[] { "please", "pleasedivide", + "divide", "divideinto", + "into", "intoshingles", + "shingles" }, + new int[] { 0, 0, 7, 7, 14, 14, 19 }, + new int[] { 6, 13, 13, 18, 18, 27, 27 }, + new int[] { 1, 0, 1, 0, 1, 0, 1 }); + analyzer.setOutputUnigrams(false); + assertAnalyzesToReuse(analyzer, "please divide into shingles", + new String[] { "pleasedivide", + "divideinto", + "intoshingles" }, + new int[] { 0, 7, 14 }, + new int[] { 13, 18, 27 }, + new int[] { 1, 1, 1 }); + } + public void testAltTokenSeparator() throws Exception { + ShingleAnalyzerWrapper analyzer + = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer()); + analyzer.setTokenSeparator(""); + assertAnalyzesToReuse(analyzer, "please divide into shingles", + new String[] { "please", "pleasedivide", + "divide", "divideinto", + "into", "intoshingles", + "shingles" }, + new int[] { 0, 0, 7, 7, 14, 14, 19 }, + new int[] { 6, 13, 13, 18, 18, 27, 27 }, + new int[] { 1, 0, 1, 0, 1, 0, 1 }); + analyzer.setOutputUnigrams(false); + assertAnalyzesToReuse(analyzer, "please divide into shingles", + new String[] { "pleasedivide", + "divideinto", + "intoshingles" }, + new int[] { 0, 7, 14 }, + new int[] { 13, 18, 27 }, + new int[] { 1, 1, 1 }); + } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java index 0d9434181e6..27a8c6bdd86 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java @@ -288,7 +288,360 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase { }; + public static final Token[] TRI_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] { + createToken("please", 0, 6), + createToken("please divide this", 0, 18), + createToken("divide", 7, 13), + createToken("divide this sentence", 
7, 27), + createToken("this", 14, 18), + createToken("this sentence into", 14, 32), + createToken("sentence", 19, 27), + createToken("sentence into shingles", 19, 39), + createToken("into", 28, 32), + createToken("shingles", 33, 39) + }; + public static final int[] TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] { + 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 + }; + + public static final String[] TRI_GRAM_TYPES_MIN_TRI_GRAM = new String[] { + "word", "shingle", + "word", "shingle", + "word", "shingle", + "word", "shingle", + "word", + "word" + }; + + public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] { + createToken("please divide this", 0, 18), + createToken("divide this sentence", 7, 27), + createToken("this sentence into", 14, 32), + createToken("sentence into shingles", 19, 39) + }; + + public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] { + 1, 1, 1, 1 + }; + + public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] { + "shingle", + "shingle", + "shingle", + "shingle" + }; + + public static final Token[] FOUR_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] { + createToken("please", 0, 6), + createToken("please divide this", 0, 18), + createToken("please divide this sentence", 0, 27), + createToken("divide", 7, 13), + createToken("divide this sentence", 7, 27), + createToken("divide this sentence into", 7, 32), + createToken("this", 14, 18), + createToken("this sentence into", 14, 32), + createToken("this sentence into shingles", 14, 39), + createToken("sentence", 19, 27), + createToken("sentence into shingles", 19, 39), + createToken("into", 28, 32), + createToken("shingles", 33, 39) + }; + + public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] { + 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 + }; + + public static final String[] FOUR_GRAM_TYPES_MIN_TRI_GRAM = new String[] { + "word", "shingle", "shingle", + "word", "shingle", "shingle", + 
"word", "shingle", "shingle", + "word", "shingle", + "word", + "word" + }; + + public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] { + createToken("please divide this", 0, 18), + createToken("please divide this sentence", 0, 27), + createToken("divide this sentence", 7, 27), + createToken("divide this sentence into", 7, 32), + createToken("this sentence into", 14, 32), + createToken("this sentence into shingles", 14, 39), + createToken("sentence into shingles", 19, 39), + }; + + public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] { + 1, 0, 1, 0, 1, 0, 1 + }; + + public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] { + "shingle", "shingle", + "shingle", "shingle", + "shingle", "shingle", + "shingle" + }; + + public static final Token[] FOUR_GRAM_TOKENS_MIN_FOUR_GRAM = new Token[] { + createToken("please", 0, 6), + createToken("please divide this sentence", 0, 27), + createToken("divide", 7, 13), + createToken("divide this sentence into", 7, 32), + createToken("this", 14, 18), + createToken("this sentence into shingles", 14, 39), + createToken("sentence", 19, 27), + createToken("into", 28, 32), + createToken("shingles", 33, 39) + }; + + public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM = new int[] { + 1, 0, 1, 0, 1, 0, 1, 1, 1 + }; + + public static final String[] FOUR_GRAM_TYPES_MIN_FOUR_GRAM = new String[] { + "word", "shingle", + "word", "shingle", + "word", "shingle", + "word", + "word", + "word" + }; + + public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new Token[] { + createToken("please divide this sentence", 0, 27), + createToken("divide this sentence into", 7, 32), + createToken("this sentence into shingles", 14, 39), + }; + + public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new int[] { + 1, 1, 1 + }; + + public static final String[] 
FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new String[] { + "shingle", + "shingle", + "shingle" + }; + + public static final Token[] BI_GRAM_TOKENS_NO_SEPARATOR = new Token[] { + createToken("please", 0, 6), + createToken("pleasedivide", 0, 13), + createToken("divide", 7, 13), + createToken("dividethis", 7, 18), + createToken("this", 14, 18), + createToken("thissentence", 14, 27), + createToken("sentence", 19, 27), + createToken("sentenceinto", 19, 32), + createToken("into", 28, 32), + createToken("intoshingles", 28, 39), + createToken("shingles", 33, 39), + }; + + public static final int[] BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] { + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 + }; + + public static final String[] BI_GRAM_TYPES_NO_SEPARATOR = new String[] { + "word", "shingle", "word", "shingle", "word", "shingle", "word", + "shingle", "word", "shingle", "word" + }; + + public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] { + createToken("pleasedivide", 0, 13), + createToken("dividethis", 7, 18), + createToken("thissentence", 14, 27), + createToken("sentenceinto", 19, 32), + createToken("intoshingles", 28, 39), + }; + + public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] { + 1, 1, 1, 1, 1 + }; + + public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] { + "shingle", "shingle", "shingle", "shingle", "shingle" + }; + + public static final Token[] TRI_GRAM_TOKENS_NO_SEPARATOR = new Token[] { + createToken("please", 0, 6), + createToken("pleasedivide", 0, 13), + createToken("pleasedividethis", 0, 18), + createToken("divide", 7, 13), + createToken("dividethis", 7, 18), + createToken("dividethissentence", 7, 27), + createToken("this", 14, 18), + createToken("thissentence", 14, 27), + createToken("thissentenceinto", 14, 32), + createToken("sentence", 19, 27), + createToken("sentenceinto", 19, 32), + createToken("sentenceintoshingles", 19, 39), + 
createToken("into", 28, 32), + createToken("intoshingles", 28, 39), + createToken("shingles", 33, 39) + }; + + public static final int[] TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] { + 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 + }; + + public static final String[] TRI_GRAM_TYPES_NO_SEPARATOR = new String[] { + "word", "shingle", "shingle", + "word", "shingle", "shingle", + "word", "shingle", "shingle", + "word", "shingle", "shingle", + "word", "shingle", + "word" + }; + + public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] { + createToken("pleasedivide", 0, 13), + createToken("pleasedividethis", 0, 18), + createToken("dividethis", 7, 18), + createToken("dividethissentence", 7, 27), + createToken("thissentence", 14, 27), + createToken("thissentenceinto", 14, 32), + createToken("sentenceinto", 19, 32), + createToken("sentenceintoshingles", 19, 39), + createToken("intoshingles", 28, 39), + }; + + public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] { + 1, 0, 1, 0, 1, 0, 1, 0, 1 + }; + + public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] { + "shingle", "shingle", + "shingle", "shingle", + "shingle", "shingle", + "shingle", "shingle", + "shingle", + }; + + public static final Token[] BI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] { + createToken("please", 0, 6), + createToken("please<SEP>divide", 0, 13), + createToken("divide", 7, 13), + createToken("divide<SEP>this", 7, 18), + createToken("this", 14, 18), + createToken("this<SEP>sentence", 14, 27), + createToken("sentence", 19, 27), + createToken("sentence<SEP>into", 19, 32), + createToken("into", 28, 32), + createToken("into<SEP>shingles", 28, 39), + createToken("shingles", 33, 39), + }; + + public static final int[] BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] { + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 + }; + + public static final String[] BI_GRAM_TYPES_ALT_SEPARATOR = new String[] { + "word", "shingle", "word", 
"shingle", "word", "shingle", "word", + "shingle", "word", "shingle", "word" + }; + + public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] { + createToken("please<SEP>divide", 0, 13), + createToken("divide<SEP>this", 7, 18), + createToken("this<SEP>sentence", 14, 27), + createToken("sentence<SEP>into", 19, 32), + createToken("into<SEP>shingles", 28, 39), + }; + + public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] { + 1, 1, 1, 1, 1 + }; + + public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] { + "shingle", "shingle", "shingle", "shingle", "shingle" + }; + + public static final Token[] TRI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] { + createToken("please", 0, 6), + createToken("please<SEP>divide", 0, 13), + createToken("please<SEP>divide<SEP>this", 0, 18), + createToken("divide", 7, 13), + createToken("divide<SEP>this", 7, 18), + createToken("divide<SEP>this<SEP>sentence", 7, 27), + createToken("this", 14, 18), + createToken("this<SEP>sentence", 14, 27), + createToken("this<SEP>sentence<SEP>into", 14, 32), + createToken("sentence", 19, 27), + createToken("sentence<SEP>into", 19, 32), + createToken("sentence<SEP>into<SEP>shingles", 19, 39), + createToken("into", 28, 32), + createToken("into<SEP>shingles", 28, 39), + createToken("shingles", 33, 39) + }; + + public static final int[] TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] { + 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 + }; + + public static final String[] TRI_GRAM_TYPES_ALT_SEPARATOR = new String[] { + "word", "shingle", "shingle", + "word", "shingle", "shingle", + "word", "shingle", "shingle", + "word", "shingle", "shingle", + "word", "shingle", + "word" + }; + + public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] { + createToken("please<SEP>divide", 0, 13), + createToken("please<SEP>divide<SEP>this", 0, 18), + createToken("divide<SEP>this", 7, 18), + createToken("divide<SEP>this<SEP>sentence", 7, 27), + createToken("this<SEP>sentence", 14, 27), + 
createToken("this<SEP>sentence<SEP>into", 14, 32), + createToken("sentence<SEP>into", 19, 32), + createToken("sentence<SEP>into<SEP>shingles", 19, 39), + createToken("into<SEP>shingles", 28, 39), + }; + + public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] { + 1, 0, 1, 0, 1, 0, 1, 0, 1 + }; + + public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] { + "shingle", "shingle", + "shingle", "shingle", + "shingle", "shingle", + "shingle", "shingle", + "shingle", + }; + + public static final Token[] TRI_GRAM_TOKENS_NULL_SEPARATOR = new Token[] { + createToken("please", 0, 6), + createToken("pleasedivide", 0, 13), + createToken("pleasedividethis", 0, 18), + createToken("divide", 7, 13), + createToken("dividethis", 7, 18), + createToken("dividethissentence", 7, 27), + createToken("this", 14, 18), + createToken("thissentence", 14, 27), + createToken("thissentenceinto", 14, 32), + createToken("sentence", 19, 27), + createToken("sentenceinto", 19, 32), + createToken("sentenceintoshingles", 19, 39), + createToken("into", 28, 32), + createToken("intoshingles", 28, 39), + createToken("shingles", 33, 39) + }; + + public static final int[] TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR = new int[] { + 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 + }; + + public static final String[] TRI_GRAM_TYPES_NULL_SEPARATOR = new String[] { + "word", "shingle", "shingle", + "word", "shingle", "shingle", + "word", "shingle", "shingle", + "word", "shingle", "shingle", + "word", "shingle", + "word" + }; + @Override protected void setUp() throws Exception { super.setUp(); @@ -379,6 +732,108 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase { } + public void testTriGramFilterMinTriGram() throws IOException { + this.shingleFilterTest(3, 3, TEST_TOKEN, TRI_GRAM_TOKENS_MIN_TRI_GRAM, + TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM, + TRI_GRAM_TYPES_MIN_TRI_GRAM, + true); + } + + public void testTriGramFilterWithoutUnigramsMinTriGram() throws 
IOException { + this.shingleFilterTest(3, 3, TEST_TOKEN, + TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, + TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, + TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, + false); + } + + public void testFourGramFilterMinTriGram() throws IOException { + this.shingleFilterTest(3, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_TRI_GRAM, + FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM, + FOUR_GRAM_TYPES_MIN_TRI_GRAM, + true); + } + + public void testFourGramFilterWithoutUnigramsMinTriGram() throws IOException { + this.shingleFilterTest(3, 4, TEST_TOKEN, + FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, + FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, + FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, false); + } + + public void testFourGramFilterMinFourGram() throws IOException { + this.shingleFilterTest(4, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_FOUR_GRAM, + FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM, + FOUR_GRAM_TYPES_MIN_FOUR_GRAM, + true); + } + + public void testFourGramFilterWithoutUnigramsMinFourGram() throws IOException { + this.shingleFilterTest(4, 4, TEST_TOKEN, + FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM, + FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM, + FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM, false); + } + + public void testBiGramFilterNoSeparator() throws IOException { + this.shingleFilterTest("", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_NO_SEPARATOR, + BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR, + BI_GRAM_TYPES_NO_SEPARATOR, true); + } + + public void testBiGramFilterWithoutUnigramsNoSeparator() throws IOException { + this.shingleFilterTest("", 2, 2, TEST_TOKEN, + BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR, + BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR, + BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR, + false); + } + public void testTriGramFilterNoSeparator() throws IOException { + this.shingleFilterTest("", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NO_SEPARATOR, + 
TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR, + TRI_GRAM_TYPES_NO_SEPARATOR, true); + } + + public void testTriGramFilterWithoutUnigramsNoSeparator() throws IOException { + this.shingleFilterTest("", 2, 3, TEST_TOKEN, + TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR, + TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR, + TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR, false); + } + + public void testBiGramFilterAltSeparator() throws IOException { + this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_ALT_SEPARATOR, + BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR, + BI_GRAM_TYPES_ALT_SEPARATOR, true); + } + + public void testBiGramFilterWithoutUnigramsAltSeparator() throws IOException { + this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN, + BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR, + BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR, + BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR, + false); + } + public void testTriGramFilterAltSeparator() throws IOException { + this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_ALT_SEPARATOR, + TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR, + TRI_GRAM_TYPES_ALT_SEPARATOR, true); + } + + public void testTriGramFilterWithoutUnigramsAltSeparator() throws IOException { + this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN, + TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR, + TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR, + TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR, false); + } + + public void testTriGramFilterNullSeparator() throws IOException { + this.shingleFilterTest(null, 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NULL_SEPARATOR, + TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR, + TRI_GRAM_TYPES_NULL_SEPARATOR, true); + } + + public void testReset() throws Exception { Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence")); TokenStream filter = new ShingleFilter(wsTokenizer, 2); @@ -403,30 +858,50 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase { throws 
IOException { ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize); + shingleFilterTestCommon + (filter, tokensToCompare, positionIncrements, types, outputUnigrams); + } + + protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, + Token[] tokensToCompare, int[] positionIncrements, + String[] types, boolean outputUnigrams) + throws IOException { + ShingleFilter filter + = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize); + shingleFilterTestCommon + (filter, tokensToCompare, positionIncrements, types, outputUnigrams); + } + + protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, + Token[] tokensToCompare, int[] positionIncrements, + String[] types, boolean outputUnigrams) + throws IOException { + ShingleFilter filter + = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize); + filter.setTokenSeparator(tokenSeparator); + shingleFilterTestCommon + (filter, tokensToCompare, positionIncrements, types, outputUnigrams); + } + + protected void shingleFilterTestCommon(ShingleFilter filter, + Token[] tokensToCompare, + int[] positionIncrements, + String[] types, boolean outputUnigrams) + throws IOException { + filter.setOutputUnigrams(outputUnigrams); - TermAttribute termAtt = filter.addAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = filter.addAttribute(OffsetAttribute.class); - PositionIncrementAttribute posIncrAtt = filter.addAttribute(PositionIncrementAttribute.class); - TypeAttribute typeAtt = filter.addAttribute(TypeAttribute.class); - - int i = 0; - while (filter.incrementToken()) { - assertTrue("ShingleFilter outputted more tokens than expected", i < tokensToCompare.length); - String termText = termAtt.term(); - String goldText = tokensToCompare[i].term(); - assertEquals("Wrong termText", goldText, termText); - assertEquals("Wrong startOffset for token \"" + termText + "\"", - 
tokensToCompare[i].startOffset(), offsetAtt.startOffset()); - assertEquals("Wrong endOffset for token \"" + termText + "\"", - tokensToCompare[i].endOffset(), offsetAtt.endOffset()); - assertEquals("Wrong positionIncrement for token \"" + termText + "\"", - positionIncrements[i], posIncrAtt.getPositionIncrement()); - assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type()); - i++; + String text[] = new String[tokensToCompare.length]; + int startOffsets[] = new int[tokensToCompare.length]; + int endOffsets[] = new int[tokensToCompare.length]; + + for (int i = 0; i < tokensToCompare.length; i++) { + text[i] = tokensToCompare[i].term(); + startOffsets[i] = tokensToCompare[i].startOffset(); + endOffsets[i] = tokensToCompare[i].endOffset(); } - assertEquals("ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" + tokensToCompare.length + ")", - tokensToCompare.length, i); + + assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements); } private static Token createToken(String term, int start, int offset)