LUCENE-2218: Improvements to ShingleFilter (performance, configurable sep. char and min shingle size)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@905043 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-01-31 14:04:01 +00:00
parent e1df47d58f
commit fdf4ea2448
5 changed files with 960 additions and 208 deletions

View File

@ -99,6 +99,10 @@ New features
* LUCENE-2243: Add DisjunctionMaxQuery support for FastVectorHighlighter.
(Koji Sekiguchi)
* LUCENE-2218: ShingleFilter supports minimum shingle size, and the separator
character is now configurable. Its also up to 20% faster.
(Steven Rowe via Robert Muir)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@ -34,7 +34,9 @@ import org.apache.lucene.util.Version;
public final class ShingleAnalyzerWrapper extends Analyzer {
private final Analyzer defaultAnalyzer;
private int maxShingleSize = 2;
private int maxShingleSize = ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
private boolean outputUnigrams = true;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
@ -44,7 +46,13 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
this(defaultAnalyzer);
this.maxShingleSize = maxShingleSize;
setMaxShingleSize(maxShingleSize);
}
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
this(defaultAnalyzer);
setMaxShingleSize(maxShingleSize);
setMinShingleSize(minShingleSize);
}
/**
@ -58,29 +66,73 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
/**
* Wraps {@link StandardAnalyzer}.
*/
public ShingleAnalyzerWrapper(Version matchVersion, int nGramSize) {
public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) {
this(matchVersion);
this.maxShingleSize = nGramSize;
setMaxShingleSize(maxShingleSize);
setMinShingleSize(minShingleSize);
}
/**
* The max shingle (ngram) size
* The max shingle (token ngram) size
*
* @return The max shingle (ngram) size
* @return The max shingle (token ngram) size
*/
public int getMaxShingleSize() {
return maxShingleSize;
}
/**
* Set the maximum size of output shingles
* Set the maximum size of output shingles (default: 2)
*
* @param maxShingleSize max shingle size
*/
public void setMaxShingleSize(int maxShingleSize) {
if (maxShingleSize < 2) {
throw new IllegalArgumentException("Max shingle size must be >= 2");
}
this.maxShingleSize = maxShingleSize;
}
/**
* The min shingle (token ngram) size
*
* @return The min shingle (token ngram) size
*/
public int getMinShingleSize() {
return minShingleSize;
}
/**
* <p>Set the min shingle size (default: 2).
* <p>This method requires that the passed in minShingleSize is not greater
* than maxShingleSize, so make sure that maxShingleSize is set before
* calling this method.
*
* @param minShingleSize min size of output shingles
*/
public void setMinShingleSize(int minShingleSize) {
if (minShingleSize < 2) {
throw new IllegalArgumentException("Min shingle size must be >= 2");
}
if (minShingleSize > maxShingleSize) {
throw new IllegalArgumentException
("Min shingle size must be <= max shingle size");
}
this.minShingleSize = minShingleSize;
}
public String getTokenSeparator() {
return tokenSeparator;
}
/**
* Sets the string to use when joining adjacent tokens to form a shingle
* @param tokenSeparator used to separate input stream tokens in output shingles
*/
public void setTokenSeparator(String tokenSeparator) {
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
}
public boolean isOutputUnigrams() {
return outputUnigrams;
}
@ -104,8 +156,10 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
} catch (IOException e) {
wrapped = defaultAnalyzer.tokenStream(fieldName, reader);
}
ShingleFilter filter = new ShingleFilter(wrapped);
ShingleFilter filter = new ShingleFilter(wrapped, minShingleSize, maxShingleSize);
filter.setMinShingleSize(minShingleSize);
filter.setMaxShingleSize(maxShingleSize);
filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams);
return filter;
}
@ -113,7 +167,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
private class SavedStreams {
TokenStream wrapped;
ShingleFilter shingle;
};
}
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
@ -135,6 +189,8 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
}
}
streams.shingle.setMaxShingleSize(maxShingleSize);
streams.shingle.setMinShingleSize(minShingleSize);
streams.shingle.setTokenSeparator(tokenSeparator);
streams.shingle.setOutputUnigrams(outputUnigrams);
return streams.shingle;
}

View File

@ -18,18 +18,15 @@ package org.apache.lucene.analysis.shingle;
*/
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.State;
/**
* <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
@ -44,26 +41,59 @@ import org.apache.lucene.util.AttributeSource.State;
*/
public final class ShingleFilter extends TokenFilter {
private LinkedList<State> shingleBuf = new LinkedList<State>();
private StringBuilder[] shingles;
private String tokenType = "shingle";
/**
* filler token for when positionIncrement is more than 1
*/
public static final char[] FILLER_TOKEN = { '_' };
/**
* default maximum shingle size is 2.
*/
public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;
/**
* The string to use when joining adjacent tokens to form a shingle
* default minimum shingle size is 2.
*/
public static final int DEFAULT_MIN_SHINGLE_SIZE = 2;
/**
* default token type attribute value is "shingle"
*/
public static final String DEFAULT_TOKEN_TYPE = "shingle";
/**
* The default string to use when joining adjacent tokens to form a shingle
*/
public static final String TOKEN_SEPARATOR = " ";
/**
* The sequence of input stream tokens (or filler tokens, if necessary)
* that will be composed to form output shingles.
*/
private LinkedList<State> inputWindow = new LinkedList<State>();
/**
* The number of input tokens in the next output token. This is the "n" in
* "token n-grams".
*/
private CircularSequence gramSize;
/**
* Shingle text is composed here.
*/
private StringBuilder shingleBuilder = new StringBuilder();
/**
* The token type attribute value to use - default is "shingle"
*/
private String tokenType = DEFAULT_TOKEN_TYPE;
/**
* The string to use when joining adjacent tokens to form a shingle
*/
private String tokenSeparator = TOKEN_SEPARATOR;
/**
* By default, we output unigrams (individual tokens) as well as shingles
* (token n-grams).
@ -76,15 +106,40 @@ public final class ShingleFilter extends TokenFilter {
private int maxShingleSize;
/**
* Constructs a ShingleFilter with the specified single size from the
* minimum shingle size (number of tokens)
*/
private int minShingleSize;
/**
* The remaining number of filler tokens inserted into the input stream
* from which shingles are composed, to handle position increments greater
* than one.
*/
private int numFillerTokensToInsert;
/**
* The next input stream token.
*/
private State nextInputStreamToken;
private final TermAttribute termAtt;
private final OffsetAttribute offsetAtt;
private final PositionIncrementAttribute posIncrAtt;
private final TypeAttribute typeAtt;
/**
* Constructs a ShingleFilter with the specified shingle size from the
* {@link TokenStream} <code>input</code>
*
* @param input input stream
* @param minShingleSize minimum shingle size produced by the filter.
* @param maxShingleSize maximum shingle size produced by the filter.
*/
public ShingleFilter(TokenStream input, int maxShingleSize) {
public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) {
super(input);
setMaxShingleSize(maxShingleSize);
setMinShingleSize(minShingleSize);
this.termAtt = addAttribute(TermAttribute.class);
this.offsetAtt = addAttribute(OffsetAttribute.class);
this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@ -92,22 +147,34 @@ public final class ShingleFilter extends TokenFilter {
}
/**
* Construct a ShingleFilter with default shingle size.
* Constructs a ShingleFilter with the specified shingle size from the
* {@link TokenStream} <code>input</code>
*
* @param input input stream
* @param maxShingleSize maximum shingle size produced by the filter.
*/
public ShingleFilter(TokenStream input, int maxShingleSize) {
this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize);
}
/**
* Construct a ShingleFilter with default shingle size: 2.
*
* @param input input stream
*/
public ShingleFilter(TokenStream input) {
this(input, DEFAULT_MAX_SHINGLE_SIZE);
this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
}
/**
* Construct a ShingleFilter with the specified token type for shingle tokens.
* Construct a ShingleFilter with the specified token type for shingle tokens
* and the default shingle size: 2
*
* @param input input stream
* @param tokenType token type for shingle tokens
*/
public ShingleFilter(TokenStream input, String tokenType) {
this(input, DEFAULT_MAX_SHINGLE_SIZE);
this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
setTokenType(tokenType);
}
@ -130,6 +197,7 @@ public final class ShingleFilter extends TokenFilter {
*/
public void setOutputUnigrams(boolean outputUnigrams) {
this.outputUnigrams = outputUnigrams;
gramSize = new CircularSequence();
}
/**
@ -141,203 +209,239 @@ public final class ShingleFilter extends TokenFilter {
if (maxShingleSize < 2) {
throw new IllegalArgumentException("Max shingle size must be >= 2");
}
shingles = new StringBuilder[maxShingleSize];
for (int i = 0; i < shingles.length; i++) {
shingles[i] = new StringBuilder();
}
this.maxShingleSize = maxShingleSize;
}
/**
* Clear the StringBuilders that are used for storing the output shingles.
* <p>Set the min shingle size (default: 2).
* <p>This method requires that the passed in minShingleSize is not greater
* than maxShingleSize, so make sure that maxShingleSize is set before
* calling this method.
* <p>The unigram output option is independent of the min shingle size.
*
* @param minShingleSize min size of output shingles
*/
private void clearShingles() {
for (int i = 0; i < shingles.length; i++) {
shingles[i].setLength(0);
public void setMinShingleSize(int minShingleSize) {
if (minShingleSize < 2) {
throw new IllegalArgumentException("Min shingle size must be >= 2");
}
if (minShingleSize > maxShingleSize) {
throw new IllegalArgumentException
("Min shingle size must be <= max shingle size");
}
this.minShingleSize = minShingleSize;
gramSize = new CircularSequence();
}
private AttributeSource.State nextToken;
private int shingleBufferPosition;
private int[] endOffsets;
/**
* Sets the string to use when joining adjacent tokens to form a shingle
* @param tokenSeparator used to separate input stream tokens in output shingles
*/
public void setTokenSeparator(String tokenSeparator) {
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
}
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#next()
*/
@Override
public final boolean incrementToken() throws IOException {
while (true) {
if (nextToken == null) {
if (!fillShingleBuffer()) {
return false;
boolean tokenAvailable = false;
if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
shiftInputWindow();
}
}
nextToken = shingleBuf.getFirst();
if (outputUnigrams) {
if (shingleBufferPosition == 0) {
restoreState(nextToken);
if ( ! inputWindow.isEmpty()) {
restoreState(inputWindow.getFirst());
if (1 == gramSize.getValue()) {
posIncrAtt.setPositionIncrement(1);
shingleBufferPosition++;
return true;
}
} else if (shingleBufferPosition % this.maxShingleSize == 0){
shingleBufferPosition++;
}
if (shingleBufferPosition < shingleBuf.size()) {
restoreState(nextToken);
typeAtt.setType(tokenType);
offsetAtt.setOffset(offsetAtt.startOffset(), endOffsets[shingleBufferPosition]);
StringBuilder buf = shingles[shingleBufferPosition];
int termLength = buf.length();
char[] termBuffer = termAtt.termBuffer();
if (termBuffer.length < termLength)
termBuffer = termAtt.resizeTermBuffer(termLength);
buf.getChars(0, termLength, termBuffer, 0);
termAtt.setTermLength(termLength);
if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) {
posIncrAtt.setPositionIncrement(1);
} else {
posIncrAtt.setPositionIncrement(0);
}
shingleBufferPosition++;
if (shingleBufferPosition == shingleBuf.size()) {
nextToken = null;
shingleBufferPosition = 0;
}
return true;
} else {
nextToken = null;
shingleBufferPosition = 0;
gramSize.advance();
tokenAvailable = true;
} else if (inputWindow.size() >= gramSize.getValue()) {
getNextShingle();
gramSize.advance();
tokenAvailable = true;
}
}
return tokenAvailable;
}
private int numFillerTokensToInsert;
private AttributeSource.State currentToken;
private boolean hasCurrentToken;
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
private PositionIncrementAttribute posIncrAtt;
private TypeAttribute typeAtt;
/**
* Get the next token from the input stream and push it on the token buffer.
* If we encounter a token with position increment > 1, we put filler tokens
* on the token buffer.
* <p/>
* Returns null when the end of the input stream is reached.
* @return the next token, or null if at end of input stream
* <p>Makes the next token a shingle of length {@link #gramSize},
* composed of tokens taken from {@link #inputWindow}.
* <p>Callers of this method must first insure that there are at least
* <code>gramSize</code> tokens available in <code>inputWindow</code>.
*/
private void getNextShingle() {
int startOffset = offsetAtt.startOffset();
int minTokNum = gramSize.getValue() - 1; // zero-based inputWindow position
if (gramSize.getValue() == minShingleSize) {
// Clear the shingle text buffer if this is the first shingle
// at the current position in the input stream.
shingleBuilder.setLength(0);
minTokNum = 0;
}
for (int tokNum = minTokNum ; tokNum < gramSize.getValue() ; ++tokNum) {
if (tokNum > 0) {
shingleBuilder.append(tokenSeparator);
}
restoreState(inputWindow.get(tokNum));
shingleBuilder.append(termAtt.termBuffer(), 0, termAtt.termLength());
}
char[] termBuffer = termAtt.termBuffer();
int termLength = shingleBuilder.length();
if (termBuffer.length < termLength) {
termBuffer = termAtt.resizeTermBuffer(termLength);
}
shingleBuilder.getChars(0, termLength, termBuffer, 0);
termAtt.setTermLength(termLength);
posIncrAtt.setPositionIncrement(gramSize.atMinValue() ? 1 : 0);
typeAtt.setType(tokenType);
offsetAtt.setOffset(startOffset, offsetAtt.endOffset());
}
/**
* <p>Get the next token from the input stream.
* <p>If the next token has <code>positionIncrement > 1</code>,
* <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
* inserted first.
* @return false for end of stream; true otherwise
* @throws IOException if the input stream has a problem
*/
private boolean getNextToken() throws IOException {
while (true) {
boolean success = false;
if (numFillerTokensToInsert > 0) {
if (currentToken == null) {
currentToken = captureState();
} else {
restoreState(currentToken);
}
numFillerTokensToInsert--;
// A filler token occupies no space
offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
return true;
}
if (hasCurrentToken) {
if (currentToken != null) {
restoreState(currentToken);
currentToken = null;
}
hasCurrentToken = false;
return true;
}
if (!input.incrementToken()) return false;
hasCurrentToken = true;
insertFillerToken();
success = true;
} else if (null != nextInputStreamToken) {
restoreState(nextInputStreamToken);
nextInputStreamToken = null;
success = true;
} else if (input.incrementToken()) {
if (posIncrAtt.getPositionIncrement() > 1) {
numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1;
insertFillerToken();
}
success = true;
}
return success;
}
/**
* Fill the output buffer with new shingles.
* Inserts a {@link #FILLER_TOKEN} and decrements
* {@link #numFillerTokensToInsert}.
*/
private void insertFillerToken() {
if (null == nextInputStreamToken) {
nextInputStreamToken = captureState();
} else {
restoreState(nextInputStreamToken);
}
--numFillerTokensToInsert;
// A filler token occupies no space
offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
}
/**
* <p>Fills {@link #inputWindow} with input stream tokens, if available,
* shifting to the right if the window was previously full.
* <p>Resets {@link #gramSize} to its minimum value.
*
* @throws IOException if there's a problem getting the next token
*/
private boolean fillShingleBuffer() throws IOException {
boolean addedToken = false;
/*
* Try to fill the shingle buffer.
*/
do {
if (getNextToken()) {
shingleBuf.add(captureState());
if (shingleBuf.size() > maxShingleSize)
{
shingleBuf.removeFirst();
private void shiftInputWindow() throws IOException {
if (inputWindow.size() > 0) {
inputWindow.removeFirst();
}
addedToken = true;
} else {
while (getNextToken()) {
inputWindow.add(captureState());
if (inputWindow.size() == maxShingleSize) {
break;
}
} while (shingleBuf.size() < maxShingleSize);
if (shingleBuf.isEmpty()) {
return false;
}
/*
* If no new token could be added to the shingle buffer, we have reached
* the end of the input stream and have to discard the least recent token.
*/
if (! addedToken) {
shingleBuf.removeFirst();
}
if (shingleBuf.isEmpty()) {
return false;
}
clearShingles();
endOffsets = new int[shingleBuf.size()];
for (int i = 0; i < endOffsets.length; i++) {
endOffsets[i] = 0;
}
int i = 0;
for (Iterator<State> it = shingleBuf.iterator(); it.hasNext(); ) {
restoreState(it.next());
for (int j = i; j < shingles.length; j++) {
if (shingles[j].length() != 0) {
shingles[j].append(TOKEN_SEPARATOR);
}
shingles[j].append(termAtt.termBuffer(), 0, termAtt.termLength());
}
endOffsets[i] = offsetAtt.endOffset();
i++;
}
return true;
gramSize.reset();
}
@Override
public void reset() throws IOException {
super.reset();
nextToken = null;
shingleBufferPosition = 0;
shingleBuf.clear();
gramSize.reset();
inputWindow.clear();
numFillerTokensToInsert = 0;
currentToken = null;
hasCurrentToken = false;
}
/**
* <p>An instance of this class is used to maintain the number of input
* stream tokens that will be used to compose the next unigram or shingle:
* {@link #gramSize}.
* <p><code>gramSize</code> will take on values from the circular sequence
* <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
* <p>1 is included in the circular sequence only if
* {@link #outputUnigrams} = true.
*/
private class CircularSequence {
private int value;
private int minValue;
public CircularSequence() {
minValue = outputUnigrams ? 1 : minShingleSize;
reset();
}
/**
* {@see #advance()}
* @return the current value.
*/
public int getValue() {
return value;
}
/**
* <p>Increments this circular number's value to the next member in the
* circular sequence
* <code>gramSize</code> will take on values from the circular sequence
* <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
* <p>1 is included in the circular sequence only if
* {@link #outputUnigrams} = true.
*
* @return the next member in the circular sequence
*/
public int advance() {
if (value == 1) {
value = minShingleSize;
} else if (value == maxShingleSize) {
reset();
} else {
++value;
}
return value;
}
/**
* <p>Sets this circular number's value to the first member of the
* circular sequence
* <p><code>gramSize</code> will take on values from the circular sequence
* <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
* <p>1 is included in the circular sequence only if
* {@link #outputUnigrams} = true.
*/
public void reset() {
value = minValue;
}
/**
* <p>Returns true if the current value is the first member of the circular
* sequence.
* <p>If {@link #outputUnigrams} = true, the first member of the circular
* sequence will be 1; otherwise, it will be {@link #minShingleSize}.
*
* @return true if the current value is the first member of the circular
* sequence; false otherwise
*/
public boolean atMinValue() {
return value == minValue;
}
}
}

View File

@ -246,4 +246,117 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
}
public void testNonDefaultMinShingleSize() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 4);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please", "please divide this", "please divide this sentence",
"divide", "divide this sentence", "divide this sentence into",
"this", "this sentence into", "this sentence into shingles",
"sentence", "sentence into shingles",
"into",
"shingles" },
new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 },
new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
analyzer.setOutputUnigrams(false);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this", "please divide this sentence",
"divide this sentence", "divide this sentence into",
"this sentence into", "this sentence into shingles",
"sentence into shingles" },
new int[] { 0, 0, 7, 7, 14, 14, 19 },
new int[] { 18, 27, 27, 32, 32, 41, 41 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
}
public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 3);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please", "please divide this",
"divide", "divide this sentence",
"this", "this sentence into",
"sentence", "sentence into shingles",
"into",
"shingles" },
new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 },
new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
analyzer.setOutputUnigrams(false);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this",
"divide this sentence",
"this sentence into",
"sentence into shingles" },
new int[] { 0, 7, 14, 19 },
new int[] { 18, 27, 32, 41 },
new int[] { 1, 1, 1, 1 });
}
public void testNoTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
analyzer.setTokenSeparator("");
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
"into", "intoshingles",
"shingles" },
new int[] { 0, 0, 7, 7, 14, 14, 19 },
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
analyzer.setOutputUnigrams(false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
"intoshingles" },
new int[] { 0, 7, 14 },
new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 });
}
public void testNullTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
analyzer.setTokenSeparator(null);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
"into", "intoshingles",
"shingles" },
new int[] { 0, 0, 7, 7, 14, 14, 19 },
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
analyzer.setOutputUnigrams(false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
"intoshingles" },
new int[] { 0, 7, 14 },
new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 });
}
public void testAltTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
analyzer.setTokenSeparator("<SEP>");
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "please<SEP>divide",
"divide", "divide<SEP>into",
"into", "into<SEP>shingles",
"shingles" },
new int[] { 0, 0, 7, 7, 14, 14, 19 },
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
analyzer.setOutputUnigrams(false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please<SEP>divide",
"divide<SEP>into",
"into<SEP>shingles" },
new int[] { 0, 7, 14 },
new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 });
}
}

View File

@ -288,6 +288,359 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
};
public static final Token[] TRI_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
createToken("please", 0, 6),
createToken("please divide this", 0, 18),
createToken("divide", 7, 13),
createToken("divide this sentence", 7, 27),
createToken("this", 14, 18),
createToken("this sentence into", 14, 32),
createToken("sentence", 19, 27),
createToken("sentence into shingles", 19, 39),
createToken("into", 28, 32),
createToken("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
1, 0, 1, 0, 1, 0, 1, 0, 1, 1
};
public static final String[] TRI_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
"word", "shingle",
"word", "shingle",
"word", "shingle",
"word", "shingle",
"word",
"word"
};
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
createToken("please divide this", 0, 18),
createToken("divide this sentence", 7, 27),
createToken("this sentence into", 14, 32),
createToken("sentence into shingles", 19, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
1, 1, 1, 1
};
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
"shingle",
"shingle",
"shingle",
"shingle"
};
public static final Token[] FOUR_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
createToken("please", 0, 6),
createToken("please divide this", 0, 18),
createToken("please divide this sentence", 0, 27),
createToken("divide", 7, 13),
createToken("divide this sentence", 7, 27),
createToken("divide this sentence into", 7, 32),
createToken("this", 14, 18),
createToken("this sentence into", 14, 32),
createToken("this sentence into shingles", 14, 39),
createToken("sentence", 19, 27),
createToken("sentence into shingles", 19, 39),
createToken("into", 28, 32),
createToken("shingles", 33, 39)
};
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1
};
public static final String[] FOUR_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle",
"word",
"word"
};
public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
createToken("please divide this", 0, 18),
createToken("please divide this sentence", 0, 27),
createToken("divide this sentence", 7, 27),
createToken("divide this sentence into", 7, 32),
createToken("this sentence into", 14, 32),
createToken("this sentence into shingles", 14, 39),
createToken("sentence into shingles", 19, 39),
};
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
1, 0, 1, 0, 1, 0, 1
};
public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle"
};
public static final Token[] FOUR_GRAM_TOKENS_MIN_FOUR_GRAM = new Token[] {
createToken("please", 0, 6),
createToken("please divide this sentence", 0, 27),
createToken("divide", 7, 13),
createToken("divide this sentence into", 7, 32),
createToken("this", 14, 18),
createToken("this sentence into shingles", 14, 39),
createToken("sentence", 19, 27),
createToken("into", 28, 32),
createToken("shingles", 33, 39)
};
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM = new int[] {
1, 0, 1, 0, 1, 0, 1, 1, 1
};
public static final String[] FOUR_GRAM_TYPES_MIN_FOUR_GRAM = new String[] {
"word", "shingle",
"word", "shingle",
"word", "shingle",
"word",
"word",
"word"
};
public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new Token[] {
createToken("please divide this sentence", 0, 27),
createToken("divide this sentence into", 7, 32),
createToken("this sentence into shingles", 14, 39),
};
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new int[] {
1, 1, 1
};
public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new String[] {
"shingle",
"shingle",
"shingle"
};
public static final Token[] BI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
createToken("please", 0, 6),
createToken("pleasedivide", 0, 13),
createToken("divide", 7, 13),
createToken("dividethis", 7, 18),
createToken("this", 14, 18),
createToken("thissentence", 14, 27),
createToken("sentence", 19, 27),
createToken("sentenceinto", 19, 32),
createToken("into", 28, 32),
createToken("intoshingles", 28, 39),
createToken("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
};
public static final String[] BI_GRAM_TYPES_NO_SEPARATOR = new String[] {
"word", "shingle", "word", "shingle", "word", "shingle", "word",
"shingle", "word", "shingle", "word"
};
public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
createToken("pleasedivide", 0, 13),
createToken("dividethis", 7, 18),
createToken("thissentence", 14, 27),
createToken("sentenceinto", 19, 32),
createToken("intoshingles", 28, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
1, 1, 1, 1, 1
};
public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
"shingle", "shingle", "shingle", "shingle", "shingle"
};
public static final Token[] TRI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
createToken("please", 0, 6),
createToken("pleasedivide", 0, 13),
createToken("pleasedividethis", 0, 18),
createToken("divide", 7, 13),
createToken("dividethis", 7, 18),
createToken("dividethissentence", 7, 27),
createToken("this", 14, 18),
createToken("thissentence", 14, 27),
createToken("thissentenceinto", 14, 32),
createToken("sentence", 19, 27),
createToken("sentenceinto", 19, 32),
createToken("sentenceintoshingles", 19, 39),
createToken("into", 28, 32),
createToken("intoshingles", 28, 39),
createToken("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
};
public static final String[] TRI_GRAM_TYPES_NO_SEPARATOR = new String[] {
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle",
"word"
};
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
createToken("pleasedivide", 0, 13),
createToken("pleasedividethis", 0, 18),
createToken("dividethis", 7, 18),
createToken("dividethissentence", 7, 27),
createToken("thissentence", 14, 27),
createToken("thissentenceinto", 14, 32),
createToken("sentenceinto", 19, 32),
createToken("sentenceintoshingles", 19, 39),
createToken("intoshingles", 28, 39),
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
1, 0, 1, 0, 1, 0, 1, 0, 1
};
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle",
};
public static final Token[] BI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
createToken("please", 0, 6),
createToken("please<SEP>divide", 0, 13),
createToken("divide", 7, 13),
createToken("divide<SEP>this", 7, 18),
createToken("this", 14, 18),
createToken("this<SEP>sentence", 14, 27),
createToken("sentence", 19, 27),
createToken("sentence<SEP>into", 19, 32),
createToken("into", 28, 32),
createToken("into<SEP>shingles", 28, 39),
createToken("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
};
public static final String[] BI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
"word", "shingle", "word", "shingle", "word", "shingle", "word",
"shingle", "word", "shingle", "word"
};
public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
createToken("please<SEP>divide", 0, 13),
createToken("divide<SEP>this", 7, 18),
createToken("this<SEP>sentence", 14, 27),
createToken("sentence<SEP>into", 19, 32),
createToken("into<SEP>shingles", 28, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
1, 1, 1, 1, 1
};
public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
"shingle", "shingle", "shingle", "shingle", "shingle"
};
public static final Token[] TRI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
createToken("please", 0, 6),
createToken("please<SEP>divide", 0, 13),
createToken("please<SEP>divide<SEP>this", 0, 18),
createToken("divide", 7, 13),
createToken("divide<SEP>this", 7, 18),
createToken("divide<SEP>this<SEP>sentence", 7, 27),
createToken("this", 14, 18),
createToken("this<SEP>sentence", 14, 27),
createToken("this<SEP>sentence<SEP>into", 14, 32),
createToken("sentence", 19, 27),
createToken("sentence<SEP>into", 19, 32),
createToken("sentence<SEP>into<SEP>shingles", 19, 39),
createToken("into", 28, 32),
createToken("into<SEP>shingles", 28, 39),
createToken("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
};
public static final String[] TRI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle",
"word"
};
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
createToken("please<SEP>divide", 0, 13),
createToken("please<SEP>divide<SEP>this", 0, 18),
createToken("divide<SEP>this", 7, 18),
createToken("divide<SEP>this<SEP>sentence", 7, 27),
createToken("this<SEP>sentence", 14, 27),
createToken("this<SEP>sentence<SEP>into", 14, 32),
createToken("sentence<SEP>into", 19, 32),
createToken("sentence<SEP>into<SEP>shingles", 19, 39),
createToken("into<SEP>shingles", 28, 39),
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
1, 0, 1, 0, 1, 0, 1, 0, 1
};
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle",
};
public static final Token[] TRI_GRAM_TOKENS_NULL_SEPARATOR = new Token[] {
createToken("please", 0, 6),
createToken("pleasedivide", 0, 13),
createToken("pleasedividethis", 0, 18),
createToken("divide", 7, 13),
createToken("dividethis", 7, 18),
createToken("dividethissentence", 7, 27),
createToken("this", 14, 18),
createToken("thissentence", 14, 27),
createToken("thissentenceinto", 14, 32),
createToken("sentence", 19, 27),
createToken("sentenceinto", 19, 32),
createToken("sentenceintoshingles", 19, 39),
createToken("into", 28, 32),
createToken("intoshingles", 28, 39),
createToken("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR = new int[] {
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
};
public static final String[] TRI_GRAM_TYPES_NULL_SEPARATOR = new String[] {
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle",
"word"
};
@Override
protected void setUp() throws Exception {
@ -379,6 +732,108 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
}
public void testTriGramFilterMinTriGram() throws IOException {
this.shingleFilterTest(3, 3, TEST_TOKEN, TRI_GRAM_TOKENS_MIN_TRI_GRAM,
TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
TRI_GRAM_TYPES_MIN_TRI_GRAM,
true);
}
public void testTriGramFilterWithoutUnigramsMinTriGram() throws IOException {
this.shingleFilterTest(3, 3, TEST_TOKEN,
TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
false);
}
public void testFourGramFilterMinTriGram() throws IOException {
this.shingleFilterTest(3, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_TRI_GRAM,
FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
FOUR_GRAM_TYPES_MIN_TRI_GRAM,
true);
}
public void testFourGramFilterWithoutUnigramsMinTriGram() throws IOException {
this.shingleFilterTest(3, 4, TEST_TOKEN,
FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, false);
}
public void testFourGramFilterMinFourGram() throws IOException {
this.shingleFilterTest(4, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_FOUR_GRAM,
FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM,
FOUR_GRAM_TYPES_MIN_FOUR_GRAM,
true);
}
public void testFourGramFilterWithoutUnigramsMinFourGram() throws IOException {
this.shingleFilterTest(4, 4, TEST_TOKEN,
FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM, false);
}
public void testBiGramFilterNoSeparator() throws IOException {
this.shingleFilterTest("", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_NO_SEPARATOR,
BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
BI_GRAM_TYPES_NO_SEPARATOR, true);
}
public void testBiGramFilterWithoutUnigramsNoSeparator() throws IOException {
this.shingleFilterTest("", 2, 2, TEST_TOKEN,
BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR,
false);
}
public void testTriGramFilterNoSeparator() throws IOException {
this.shingleFilterTest("", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NO_SEPARATOR,
TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
TRI_GRAM_TYPES_NO_SEPARATOR, true);
}
public void testTriGramFilterWithoutUnigramsNoSeparator() throws IOException {
this.shingleFilterTest("", 2, 3, TEST_TOKEN,
TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR, false);
}
public void testBiGramFilterAltSeparator() throws IOException {
this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_ALT_SEPARATOR,
BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
BI_GRAM_TYPES_ALT_SEPARATOR, true);
}
public void testBiGramFilterWithoutUnigramsAltSeparator() throws IOException {
this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN,
BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
false);
}
public void testTriGramFilterAltSeparator() throws IOException {
this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_ALT_SEPARATOR,
TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
TRI_GRAM_TYPES_ALT_SEPARATOR, true);
}
public void testTriGramFilterWithoutUnigramsAltSeparator() throws IOException {
this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN,
TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR, false);
}
public void testTriGramFilterNullSeparator() throws IOException {
this.shingleFilterTest(null, 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NULL_SEPARATOR,
TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR,
TRI_GRAM_TYPES_NULL_SEPARATOR, true);
}
public void testReset() throws Exception {
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
TokenStream filter = new ShingleFilter(wsTokenizer, 2);
@ -403,30 +858,50 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
throws IOException {
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
shingleFilterTestCommon
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
}
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
Token[] tokensToCompare, int[] positionIncrements,
String[] types, boolean outputUnigrams)
throws IOException {
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
shingleFilterTestCommon
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
}
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
Token[] tokensToCompare, int[] positionIncrements,
String[] types, boolean outputUnigrams)
throws IOException {
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
filter.setTokenSeparator(tokenSeparator);
shingleFilterTestCommon
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
}
protected void shingleFilterTestCommon(ShingleFilter filter,
Token[] tokensToCompare,
int[] positionIncrements,
String[] types, boolean outputUnigrams)
throws IOException {
filter.setOutputUnigrams(outputUnigrams);
TermAttribute termAtt = filter.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = filter.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncrAtt = filter.addAttribute(PositionIncrementAttribute.class);
TypeAttribute typeAtt = filter.addAttribute(TypeAttribute.class);
String text[] = new String[tokensToCompare.length];
int startOffsets[] = new int[tokensToCompare.length];
int endOffsets[] = new int[tokensToCompare.length];
int i = 0;
while (filter.incrementToken()) {
assertTrue("ShingleFilter outputted more tokens than expected", i < tokensToCompare.length);
String termText = termAtt.term();
String goldText = tokensToCompare[i].term();
assertEquals("Wrong termText", goldText, termText);
assertEquals("Wrong startOffset for token \"" + termText + "\"",
tokensToCompare[i].startOffset(), offsetAtt.startOffset());
assertEquals("Wrong endOffset for token \"" + termText + "\"",
tokensToCompare[i].endOffset(), offsetAtt.endOffset());
assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
positionIncrements[i], posIncrAtt.getPositionIncrement());
assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
i++;
for (int i = 0; i < tokensToCompare.length; i++) {
text[i] = tokensToCompare[i].term();
startOffsets[i] = tokensToCompare[i].startOffset();
endOffsets[i] = tokensToCompare[i].endOffset();
}
assertEquals("ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" + tokensToCompare.length + ")",
tokensToCompare.length, i);
assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
}
private static Token createToken(String term, int start, int offset)