mirror of https://github.com/apache/lucene.git
LUCENE-2218: Improvements to ShingleFilter (performance, configurable sep. char and min shingle size)
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@905043 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e1df47d58f
commit
fdf4ea2448
|
@ -99,6 +99,10 @@ New features
|
|||
* LUCENE-2243: Add DisjunctionMaxQuery support for FastVectorHighlighter.
|
||||
(Koji Sekiguchi)
|
||||
|
||||
* LUCENE-2218: ShingleFilter supports minimum shingle size, and the separator
|
||||
character is now configurable. It's also up to 20% faster.
|
||||
(Steven Rowe via Robert Muir)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||
|
|
|
@ -34,7 +34,9 @@ import org.apache.lucene.util.Version;
|
|||
public final class ShingleAnalyzerWrapper extends Analyzer {
|
||||
|
||||
private final Analyzer defaultAnalyzer;
|
||||
private int maxShingleSize = 2;
|
||||
private int maxShingleSize = ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
|
||||
private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
|
||||
private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
|
||||
private boolean outputUnigrams = true;
|
||||
|
||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
|
||||
|
@ -44,7 +46,13 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
|||
|
||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
|
||||
this(defaultAnalyzer);
|
||||
this.maxShingleSize = maxShingleSize;
|
||||
setMaxShingleSize(maxShingleSize);
|
||||
}
|
||||
|
||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
|
||||
this(defaultAnalyzer);
|
||||
setMaxShingleSize(maxShingleSize);
|
||||
setMinShingleSize(minShingleSize);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -58,29 +66,73 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
|||
/**
|
||||
* Wraps {@link StandardAnalyzer}.
|
||||
*/
|
||||
public ShingleAnalyzerWrapper(Version matchVersion, int nGramSize) {
|
||||
public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) {
|
||||
this(matchVersion);
|
||||
this.maxShingleSize = nGramSize;
|
||||
setMaxShingleSize(maxShingleSize);
|
||||
setMinShingleSize(minShingleSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* The max shingle (ngram) size
|
||||
* The max shingle (token ngram) size
|
||||
*
|
||||
* @return The max shingle (ngram) size
|
||||
* @return The max shingle (token ngram) size
|
||||
*/
|
||||
public int getMaxShingleSize() {
|
||||
return maxShingleSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the maximum size of output shingles
|
||||
* Set the maximum size of output shingles (default: 2)
|
||||
*
|
||||
* @param maxShingleSize max shingle size
|
||||
*/
|
||||
public void setMaxShingleSize(int maxShingleSize) {
|
||||
if (maxShingleSize < 2) {
|
||||
throw new IllegalArgumentException("Max shingle size must be >= 2");
|
||||
}
|
||||
this.maxShingleSize = maxShingleSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* The min shingle (token ngram) size
|
||||
*
|
||||
* @return The min shingle (token ngram) size
|
||||
*/
|
||||
public int getMinShingleSize() {
|
||||
return minShingleSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Set the min shingle size (default: 2).
|
||||
* <p>This method requires that the passed in minShingleSize is not greater
|
||||
* than maxShingleSize, so make sure that maxShingleSize is set before
|
||||
* calling this method.
|
||||
*
|
||||
* @param minShingleSize min size of output shingles
|
||||
*/
|
||||
public void setMinShingleSize(int minShingleSize) {
|
||||
if (minShingleSize < 2) {
|
||||
throw new IllegalArgumentException("Min shingle size must be >= 2");
|
||||
}
|
||||
if (minShingleSize > maxShingleSize) {
|
||||
throw new IllegalArgumentException
|
||||
("Min shingle size must be <= max shingle size");
|
||||
}
|
||||
this.minShingleSize = minShingleSize;
|
||||
}
|
||||
|
||||
public String getTokenSeparator() {
|
||||
return tokenSeparator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the string to use when joining adjacent tokens to form a shingle
|
||||
* @param tokenSeparator used to separate input stream tokens in output shingles
|
||||
*/
|
||||
public void setTokenSeparator(String tokenSeparator) {
|
||||
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
|
||||
}
|
||||
|
||||
public boolean isOutputUnigrams() {
|
||||
return outputUnigrams;
|
||||
}
|
||||
|
@ -104,8 +156,10 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
|||
} catch (IOException e) {
|
||||
wrapped = defaultAnalyzer.tokenStream(fieldName, reader);
|
||||
}
|
||||
ShingleFilter filter = new ShingleFilter(wrapped);
|
||||
ShingleFilter filter = new ShingleFilter(wrapped, minShingleSize, maxShingleSize);
|
||||
filter.setMinShingleSize(minShingleSize);
|
||||
filter.setMaxShingleSize(maxShingleSize);
|
||||
filter.setTokenSeparator(tokenSeparator);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
return filter;
|
||||
}
|
||||
|
@ -113,7 +167,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
|||
private class SavedStreams {
|
||||
TokenStream wrapped;
|
||||
ShingleFilter shingle;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
|
@ -135,6 +189,8 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
|||
}
|
||||
}
|
||||
streams.shingle.setMaxShingleSize(maxShingleSize);
|
||||
streams.shingle.setMinShingleSize(minShingleSize);
|
||||
streams.shingle.setTokenSeparator(tokenSeparator);
|
||||
streams.shingle.setOutputUnigrams(outputUnigrams);
|
||||
return streams.shingle;
|
||||
}
|
||||
|
|
|
@ -18,18 +18,15 @@ package org.apache.lucene.analysis.shingle;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.AttributeSource.State;
|
||||
|
||||
|
||||
/**
|
||||
* <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
|
||||
|
@ -44,26 +41,59 @@ import org.apache.lucene.util.AttributeSource.State;
|
|||
*/
|
||||
public final class ShingleFilter extends TokenFilter {
|
||||
|
||||
private LinkedList<State> shingleBuf = new LinkedList<State>();
|
||||
private StringBuilder[] shingles;
|
||||
private String tokenType = "shingle";
|
||||
|
||||
/**
|
||||
* filler token for when positionIncrement is more than 1
|
||||
*/
|
||||
public static final char[] FILLER_TOKEN = { '_' };
|
||||
|
||||
|
||||
/**
|
||||
* default maximum shingle size is 2.
|
||||
*/
|
||||
public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;
|
||||
|
||||
/**
|
||||
* The string to use when joining adjacent tokens to form a shingle
|
||||
* default minimum shingle size is 2.
|
||||
*/
|
||||
public static final int DEFAULT_MIN_SHINGLE_SIZE = 2;
|
||||
|
||||
/**
|
||||
* default token type attribute value is "shingle"
|
||||
*/
|
||||
public static final String DEFAULT_TOKEN_TYPE = "shingle";
|
||||
|
||||
/**
|
||||
* The default string to use when joining adjacent tokens to form a shingle
|
||||
*/
|
||||
public static final String TOKEN_SEPARATOR = " ";
|
||||
|
||||
|
||||
/**
|
||||
* The sequence of input stream tokens (or filler tokens, if necessary)
|
||||
* that will be composed to form output shingles.
|
||||
*/
|
||||
private LinkedList<State> inputWindow = new LinkedList<State>();
|
||||
|
||||
/**
|
||||
* The number of input tokens in the next output token. This is the "n" in
|
||||
* "token n-grams".
|
||||
*/
|
||||
private CircularSequence gramSize;
|
||||
|
||||
/**
|
||||
* Shingle text is composed here.
|
||||
*/
|
||||
private StringBuilder shingleBuilder = new StringBuilder();
|
||||
|
||||
/**
|
||||
* The token type attribute value to use - default is "shingle"
|
||||
*/
|
||||
private String tokenType = DEFAULT_TOKEN_TYPE;
|
||||
|
||||
/**
|
||||
* The string to use when joining adjacent tokens to form a shingle
|
||||
*/
|
||||
private String tokenSeparator = TOKEN_SEPARATOR;
|
||||
|
||||
/**
|
||||
* By default, we output unigrams (individual tokens) as well as shingles
|
||||
* (token n-grams).
|
||||
|
@ -76,15 +106,40 @@ public final class ShingleFilter extends TokenFilter {
|
|||
private int maxShingleSize;
|
||||
|
||||
/**
|
||||
* Constructs a ShingleFilter with the specified single size from the
|
||||
* minimum shingle size (number of tokens)
|
||||
*/
|
||||
private int minShingleSize;
|
||||
|
||||
/**
|
||||
* The remaining number of filler tokens inserted into the input stream
|
||||
* from which shingles are composed, to handle position increments greater
|
||||
* than one.
|
||||
*/
|
||||
private int numFillerTokensToInsert;
|
||||
|
||||
/**
|
||||
* The next input stream token.
|
||||
*/
|
||||
private State nextInputStreamToken;
|
||||
|
||||
private final TermAttribute termAtt;
|
||||
private final OffsetAttribute offsetAtt;
|
||||
private final PositionIncrementAttribute posIncrAtt;
|
||||
private final TypeAttribute typeAtt;
|
||||
|
||||
|
||||
/**
|
||||
* Constructs a ShingleFilter with the specified shingle size from the
|
||||
* {@link TokenStream} <code>input</code>
|
||||
*
|
||||
* @param input input stream
|
||||
* @param minShingleSize minimum shingle size produced by the filter.
|
||||
* @param maxShingleSize maximum shingle size produced by the filter.
|
||||
*/
|
||||
public ShingleFilter(TokenStream input, int maxShingleSize) {
|
||||
public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) {
|
||||
super(input);
|
||||
setMaxShingleSize(maxShingleSize);
|
||||
setMinShingleSize(minShingleSize);
|
||||
this.termAtt = addAttribute(TermAttribute.class);
|
||||
this.offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
|
@ -92,22 +147,34 @@ public final class ShingleFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
/**
|
||||
* Construct a ShingleFilter with default shingle size.
|
||||
* Constructs a ShingleFilter with the specified shingle size from the
|
||||
* {@link TokenStream} <code>input</code>
|
||||
*
|
||||
* @param input input stream
|
||||
* @param maxShingleSize maximum shingle size produced by the filter.
|
||||
*/
|
||||
public ShingleFilter(TokenStream input, int maxShingleSize) {
|
||||
this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a ShingleFilter with default shingle size: 2.
|
||||
*
|
||||
* @param input input stream
|
||||
*/
|
||||
public ShingleFilter(TokenStream input) {
|
||||
this(input, DEFAULT_MAX_SHINGLE_SIZE);
|
||||
this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a ShingleFilter with the specified token type for shingle tokens.
|
||||
* Construct a ShingleFilter with the specified token type for shingle tokens
|
||||
* and the default shingle size: 2
|
||||
*
|
||||
* @param input input stream
|
||||
* @param tokenType token type for shingle tokens
|
||||
*/
|
||||
public ShingleFilter(TokenStream input, String tokenType) {
|
||||
this(input, DEFAULT_MAX_SHINGLE_SIZE);
|
||||
this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
|
||||
setTokenType(tokenType);
|
||||
}
|
||||
|
||||
|
@ -130,6 +197,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
*/
|
||||
public void setOutputUnigrams(boolean outputUnigrams) {
|
||||
this.outputUnigrams = outputUnigrams;
|
||||
gramSize = new CircularSequence();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -141,203 +209,239 @@ public final class ShingleFilter extends TokenFilter {
|
|||
if (maxShingleSize < 2) {
|
||||
throw new IllegalArgumentException("Max shingle size must be >= 2");
|
||||
}
|
||||
shingles = new StringBuilder[maxShingleSize];
|
||||
for (int i = 0; i < shingles.length; i++) {
|
||||
shingles[i] = new StringBuilder();
|
||||
}
|
||||
this.maxShingleSize = maxShingleSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear the StringBuilders that are used for storing the output shingles.
|
||||
* <p>Set the min shingle size (default: 2).
|
||||
* <p>This method requires that the passed in minShingleSize is not greater
|
||||
* than maxShingleSize, so make sure that maxShingleSize is set before
|
||||
* calling this method.
|
||||
* <p>The unigram output option is independent of the min shingle size.
|
||||
*
|
||||
* @param minShingleSize min size of output shingles
|
||||
*/
|
||||
private void clearShingles() {
|
||||
for (int i = 0; i < shingles.length; i++) {
|
||||
shingles[i].setLength(0);
|
||||
public void setMinShingleSize(int minShingleSize) {
|
||||
if (minShingleSize < 2) {
|
||||
throw new IllegalArgumentException("Min shingle size must be >= 2");
|
||||
}
|
||||
if (minShingleSize > maxShingleSize) {
|
||||
throw new IllegalArgumentException
|
||||
("Min shingle size must be <= max shingle size");
|
||||
}
|
||||
this.minShingleSize = minShingleSize;
|
||||
gramSize = new CircularSequence();
|
||||
}
|
||||
|
||||
private AttributeSource.State nextToken;
|
||||
private int shingleBufferPosition;
|
||||
private int[] endOffsets;
|
||||
/**
|
||||
* Sets the string to use when joining adjacent tokens to form a shingle
|
||||
* @param tokenSeparator used to separate input stream tokens in output shingles
|
||||
*/
|
||||
public void setTokenSeparator(String tokenSeparator) {
|
||||
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
|
||||
*/
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
while (true) {
|
||||
if (nextToken == null) {
|
||||
if (!fillShingleBuffer()) {
|
||||
return false;
|
||||
boolean tokenAvailable = false;
|
||||
if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
|
||||
shiftInputWindow();
|
||||
}
|
||||
}
|
||||
|
||||
nextToken = shingleBuf.getFirst();
|
||||
|
||||
if (outputUnigrams) {
|
||||
if (shingleBufferPosition == 0) {
|
||||
restoreState(nextToken);
|
||||
if ( ! inputWindow.isEmpty()) {
|
||||
restoreState(inputWindow.getFirst());
|
||||
if (1 == gramSize.getValue()) {
|
||||
posIncrAtt.setPositionIncrement(1);
|
||||
shingleBufferPosition++;
|
||||
return true;
|
||||
}
|
||||
} else if (shingleBufferPosition % this.maxShingleSize == 0){
|
||||
shingleBufferPosition++;
|
||||
}
|
||||
|
||||
if (shingleBufferPosition < shingleBuf.size()) {
|
||||
restoreState(nextToken);
|
||||
typeAtt.setType(tokenType);
|
||||
offsetAtt.setOffset(offsetAtt.startOffset(), endOffsets[shingleBufferPosition]);
|
||||
StringBuilder buf = shingles[shingleBufferPosition];
|
||||
int termLength = buf.length();
|
||||
char[] termBuffer = termAtt.termBuffer();
|
||||
if (termBuffer.length < termLength)
|
||||
termBuffer = termAtt.resizeTermBuffer(termLength);
|
||||
buf.getChars(0, termLength, termBuffer, 0);
|
||||
termAtt.setTermLength(termLength);
|
||||
if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) {
|
||||
posIncrAtt.setPositionIncrement(1);
|
||||
} else {
|
||||
posIncrAtt.setPositionIncrement(0);
|
||||
}
|
||||
shingleBufferPosition++;
|
||||
if (shingleBufferPosition == shingleBuf.size()) {
|
||||
nextToken = null;
|
||||
shingleBufferPosition = 0;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
nextToken = null;
|
||||
shingleBufferPosition = 0;
|
||||
gramSize.advance();
|
||||
tokenAvailable = true;
|
||||
} else if (inputWindow.size() >= gramSize.getValue()) {
|
||||
getNextShingle();
|
||||
gramSize.advance();
|
||||
tokenAvailable = true;
|
||||
}
|
||||
}
|
||||
return tokenAvailable;
|
||||
}
|
||||
|
||||
private int numFillerTokensToInsert;
|
||||
private AttributeSource.State currentToken;
|
||||
private boolean hasCurrentToken;
|
||||
|
||||
private TermAttribute termAtt;
|
||||
private OffsetAttribute offsetAtt;
|
||||
private PositionIncrementAttribute posIncrAtt;
|
||||
private TypeAttribute typeAtt;
|
||||
|
||||
/**
|
||||
* Get the next token from the input stream and push it on the token buffer.
|
||||
* If we encounter a token with position increment > 1, we put filler tokens
|
||||
* on the token buffer.
|
||||
* <p/>
|
||||
* Returns null when the end of the input stream is reached.
|
||||
* @return the next token, or null if at end of input stream
|
||||
* <p>Makes the next token a shingle of length {@link #gramSize},
|
||||
* composed of tokens taken from {@link #inputWindow}.
|
||||
* <p>Callers of this method must first ensure that there are at least
|
||||
* <code>gramSize</code> tokens available in <code>inputWindow</code>.
|
||||
*/
|
||||
private void getNextShingle() {
|
||||
int startOffset = offsetAtt.startOffset();
|
||||
|
||||
int minTokNum = gramSize.getValue() - 1; // zero-based inputWindow position
|
||||
if (gramSize.getValue() == minShingleSize) {
|
||||
// Clear the shingle text buffer if this is the first shingle
|
||||
// at the current position in the input stream.
|
||||
shingleBuilder.setLength(0);
|
||||
minTokNum = 0;
|
||||
}
|
||||
for (int tokNum = minTokNum ; tokNum < gramSize.getValue() ; ++tokNum) {
|
||||
if (tokNum > 0) {
|
||||
shingleBuilder.append(tokenSeparator);
|
||||
}
|
||||
restoreState(inputWindow.get(tokNum));
|
||||
shingleBuilder.append(termAtt.termBuffer(), 0, termAtt.termLength());
|
||||
}
|
||||
char[] termBuffer = termAtt.termBuffer();
|
||||
int termLength = shingleBuilder.length();
|
||||
if (termBuffer.length < termLength) {
|
||||
termBuffer = termAtt.resizeTermBuffer(termLength);
|
||||
}
|
||||
shingleBuilder.getChars(0, termLength, termBuffer, 0);
|
||||
termAtt.setTermLength(termLength);
|
||||
posIncrAtt.setPositionIncrement(gramSize.atMinValue() ? 1 : 0);
|
||||
typeAtt.setType(tokenType);
|
||||
offsetAtt.setOffset(startOffset, offsetAtt.endOffset());
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Get the next token from the input stream.
|
||||
* <p>If the next token has <code>positionIncrement &gt; 1</code>,
|
||||
* <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
|
||||
* inserted first.
|
||||
* @return false for end of stream; true otherwise
|
||||
* @throws IOException if the input stream has a problem
|
||||
*/
|
||||
private boolean getNextToken() throws IOException {
|
||||
|
||||
while (true) {
|
||||
boolean success = false;
|
||||
if (numFillerTokensToInsert > 0) {
|
||||
if (currentToken == null) {
|
||||
currentToken = captureState();
|
||||
} else {
|
||||
restoreState(currentToken);
|
||||
}
|
||||
numFillerTokensToInsert--;
|
||||
// A filler token occupies no space
|
||||
offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
|
||||
termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (hasCurrentToken) {
|
||||
if (currentToken != null) {
|
||||
restoreState(currentToken);
|
||||
currentToken = null;
|
||||
}
|
||||
hasCurrentToken = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!input.incrementToken()) return false;
|
||||
hasCurrentToken = true;
|
||||
|
||||
insertFillerToken();
|
||||
success = true;
|
||||
} else if (null != nextInputStreamToken) {
|
||||
restoreState(nextInputStreamToken);
|
||||
nextInputStreamToken = null;
|
||||
success = true;
|
||||
} else if (input.incrementToken()) {
|
||||
if (posIncrAtt.getPositionIncrement() > 1) {
|
||||
numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1;
|
||||
insertFillerToken();
|
||||
}
|
||||
success = true;
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fill the output buffer with new shingles.
|
||||
* Inserts a {@link #FILLER_TOKEN} and decrements
|
||||
* {@link #numFillerTokensToInsert}.
|
||||
*/
|
||||
private void insertFillerToken() {
|
||||
if (null == nextInputStreamToken) {
|
||||
nextInputStreamToken = captureState();
|
||||
} else {
|
||||
restoreState(nextInputStreamToken);
|
||||
}
|
||||
--numFillerTokensToInsert;
|
||||
// A filler token occupies no space
|
||||
offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
|
||||
termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Fills {@link #inputWindow} with input stream tokens, if available,
|
||||
* shifting to the right if the window was previously full.
|
||||
* <p>Resets {@link #gramSize} to its minimum value.
|
||||
*
|
||||
* @throws IOException if there's a problem getting the next token
|
||||
*/
|
||||
private boolean fillShingleBuffer() throws IOException {
|
||||
boolean addedToken = false;
|
||||
/*
|
||||
* Try to fill the shingle buffer.
|
||||
*/
|
||||
do {
|
||||
if (getNextToken()) {
|
||||
shingleBuf.add(captureState());
|
||||
if (shingleBuf.size() > maxShingleSize)
|
||||
{
|
||||
shingleBuf.removeFirst();
|
||||
private void shiftInputWindow() throws IOException {
|
||||
if (inputWindow.size() > 0) {
|
||||
inputWindow.removeFirst();
|
||||
}
|
||||
addedToken = true;
|
||||
} else {
|
||||
while (getNextToken()) {
|
||||
inputWindow.add(captureState());
|
||||
if (inputWindow.size() == maxShingleSize) {
|
||||
break;
|
||||
}
|
||||
} while (shingleBuf.size() < maxShingleSize);
|
||||
|
||||
if (shingleBuf.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* If no new token could be added to the shingle buffer, we have reached
|
||||
* the end of the input stream and have to discard the least recent token.
|
||||
*/
|
||||
if (! addedToken) {
|
||||
shingleBuf.removeFirst();
|
||||
}
|
||||
|
||||
if (shingleBuf.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
clearShingles();
|
||||
|
||||
endOffsets = new int[shingleBuf.size()];
|
||||
for (int i = 0; i < endOffsets.length; i++) {
|
||||
endOffsets[i] = 0;
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
for (Iterator<State> it = shingleBuf.iterator(); it.hasNext(); ) {
|
||||
restoreState(it.next());
|
||||
for (int j = i; j < shingles.length; j++) {
|
||||
if (shingles[j].length() != 0) {
|
||||
shingles[j].append(TOKEN_SEPARATOR);
|
||||
}
|
||||
shingles[j].append(termAtt.termBuffer(), 0, termAtt.termLength());
|
||||
}
|
||||
|
||||
endOffsets[i] = offsetAtt.endOffset();
|
||||
i++;
|
||||
}
|
||||
|
||||
return true;
|
||||
gramSize.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
nextToken = null;
|
||||
shingleBufferPosition = 0;
|
||||
shingleBuf.clear();
|
||||
gramSize.reset();
|
||||
inputWindow.clear();
|
||||
numFillerTokensToInsert = 0;
|
||||
currentToken = null;
|
||||
hasCurrentToken = false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <p>An instance of this class is used to maintain the number of input
|
||||
* stream tokens that will be used to compose the next unigram or shingle:
|
||||
* {@link #gramSize}.
|
||||
* <p><code>gramSize</code> will take on values from the circular sequence
|
||||
* <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
|
||||
* <p>1 is included in the circular sequence only if
|
||||
* {@link #outputUnigrams} = true.
|
||||
*/
|
||||
private class CircularSequence {
|
||||
private int value;
|
||||
private int minValue;
|
||||
|
||||
public CircularSequence() {
|
||||
minValue = outputUnigrams ? 1 : minShingleSize;
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #advance()
|
||||
* @return the current value.
|
||||
*/
|
||||
public int getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Increments this circular number's value to the next member in the
|
||||
* circular sequence.
|
||||
* <p><code>gramSize</code> will take on values from the circular sequence
|
||||
* <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
|
||||
* <p>1 is included in the circular sequence only if
|
||||
* {@link #outputUnigrams} = true.
|
||||
*
|
||||
* @return the next member in the circular sequence
|
||||
*/
|
||||
public int advance() {
|
||||
if (value == 1) {
|
||||
value = minShingleSize;
|
||||
} else if (value == maxShingleSize) {
|
||||
reset();
|
||||
} else {
|
||||
++value;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Sets this circular number's value to the first member of the
|
||||
* circular sequence.
|
||||
* <p><code>gramSize</code> will take on values from the circular sequence
|
||||
* <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
|
||||
* <p>1 is included in the circular sequence only if
|
||||
* {@link #outputUnigrams} = true.
|
||||
*/
|
||||
public void reset() {
|
||||
value = minValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns true if the current value is the first member of the circular
|
||||
* sequence.
|
||||
* <p>If {@link #outputUnigrams} = true, the first member of the circular
|
||||
* sequence will be 1; otherwise, it will be {@link #minShingleSize}.
|
||||
*
|
||||
* @return true if the current value is the first member of the circular
|
||||
* sequence; false otherwise
|
||||
*/
|
||||
public boolean atMinValue() {
|
||||
return value == minValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -246,4 +246,117 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 6, 13, 13, 18, 18, 27, 27 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
}
|
||||
|
||||
public void testNonDefaultMinShingleSize() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer
|
||||
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 4);
|
||||
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
|
||||
new String[] { "please", "please divide this", "please divide this sentence",
|
||||
"divide", "divide this sentence", "divide this sentence into",
|
||||
"this", "this sentence into", "this sentence into shingles",
|
||||
"sentence", "sentence into shingles",
|
||||
"into",
|
||||
"shingles" },
|
||||
new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 },
|
||||
new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
|
||||
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
|
||||
analyzer.setOutputUnigrams(false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
|
||||
new String[] { "please divide this", "please divide this sentence",
|
||||
"divide this sentence", "divide this sentence into",
|
||||
"this sentence into", "this sentence into shingles",
|
||||
"sentence into shingles" },
|
||||
new int[] { 0, 0, 7, 7, 14, 14, 19 },
|
||||
new int[] { 18, 27, 27, 32, 32, 41, 41 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
}
|
||||
|
||||
public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer
|
||||
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 3);
|
||||
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
|
||||
new String[] { "please", "please divide this",
|
||||
"divide", "divide this sentence",
|
||||
"this", "this sentence into",
|
||||
"sentence", "sentence into shingles",
|
||||
"into",
|
||||
"shingles" },
|
||||
new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 },
|
||||
new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
|
||||
analyzer.setOutputUnigrams(false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
|
||||
new String[] { "please divide this",
|
||||
"divide this sentence",
|
||||
"this sentence into",
|
||||
"sentence into shingles" },
|
||||
new int[] { 0, 7, 14, 19 },
|
||||
new int[] { 18, 27, 32, 41 },
|
||||
new int[] { 1, 1, 1, 1 });
|
||||
}
|
||||
|
||||
public void testNoTokenSeparator() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer
|
||||
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
|
||||
analyzer.setTokenSeparator("");
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "please", "pleasedivide",
|
||||
"divide", "divideinto",
|
||||
"into", "intoshingles",
|
||||
"shingles" },
|
||||
new int[] { 0, 0, 7, 7, 14, 14, 19 },
|
||||
new int[] { 6, 13, 13, 18, 18, 27, 27 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
analyzer.setOutputUnigrams(false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "pleasedivide",
|
||||
"divideinto",
|
||||
"intoshingles" },
|
||||
new int[] { 0, 7, 14 },
|
||||
new int[] { 13, 18, 27 },
|
||||
new int[] { 1, 1, 1 });
|
||||
}
|
||||
|
||||
public void testNullTokenSeparator() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer
|
||||
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
|
||||
analyzer.setTokenSeparator(null);
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "please", "pleasedivide",
|
||||
"divide", "divideinto",
|
||||
"into", "intoshingles",
|
||||
"shingles" },
|
||||
new int[] { 0, 0, 7, 7, 14, 14, 19 },
|
||||
new int[] { 6, 13, 13, 18, 18, 27, 27 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
analyzer.setOutputUnigrams(false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "pleasedivide",
|
||||
"divideinto",
|
||||
"intoshingles" },
|
||||
new int[] { 0, 7, 14 },
|
||||
new int[] { 13, 18, 27 },
|
||||
new int[] { 1, 1, 1 });
|
||||
}
|
||||
public void testAltTokenSeparator() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer
|
||||
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
|
||||
analyzer.setTokenSeparator("<SEP>");
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "please", "please<SEP>divide",
|
||||
"divide", "divide<SEP>into",
|
||||
"into", "into<SEP>shingles",
|
||||
"shingles" },
|
||||
new int[] { 0, 0, 7, 7, 14, 14, 19 },
|
||||
new int[] { 6, 13, 13, 18, 18, 27, 27 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
analyzer.setOutputUnigrams(false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "please<SEP>divide",
|
||||
"divide<SEP>into",
|
||||
"into<SEP>shingles" },
|
||||
new int[] { 0, 7, 14 },
|
||||
new int[] { 13, 18, 27 },
|
||||
new int[] { 1, 1, 1 });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -288,6 +288,359 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("please divide this", 0, 18),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("divide this sentence", 7, 27),
|
||||
createToken("this", 14, 18),
|
||||
createToken("this sentence into", 14, 32),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentence into shingles", 19, 39),
|
||||
createToken("into", 28, 32),
|
||||
createToken("shingles", 33, 39)
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 1
|
||||
};
|
||||
|
||||
public static final String[] TRI_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
|
||||
"word", "shingle",
|
||||
"word", "shingle",
|
||||
"word", "shingle",
|
||||
"word", "shingle",
|
||||
"word",
|
||||
"word"
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
|
||||
createToken("please divide this", 0, 18),
|
||||
createToken("divide this sentence", 7, 27),
|
||||
createToken("this sentence into", 14, 32),
|
||||
createToken("sentence into shingles", 19, 39)
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
|
||||
1, 1, 1, 1
|
||||
};
|
||||
|
||||
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
|
||||
"shingle",
|
||||
"shingle",
|
||||
"shingle",
|
||||
"shingle"
|
||||
};
|
||||
|
||||
public static final Token[] FOUR_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("please divide this", 0, 18),
|
||||
createToken("please divide this sentence", 0, 27),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("divide this sentence", 7, 27),
|
||||
createToken("divide this sentence into", 7, 32),
|
||||
createToken("this", 14, 18),
|
||||
createToken("this sentence into", 14, 32),
|
||||
createToken("this sentence into shingles", 14, 39),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentence into shingles", 19, 39),
|
||||
createToken("into", 28, 32),
|
||||
createToken("shingles", 33, 39)
|
||||
};
|
||||
|
||||
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
|
||||
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1
|
||||
};
|
||||
|
||||
public static final String[] FOUR_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle",
|
||||
"word",
|
||||
"word"
|
||||
};
|
||||
|
||||
public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
|
||||
createToken("please divide this", 0, 18),
|
||||
createToken("please divide this sentence", 0, 27),
|
||||
createToken("divide this sentence", 7, 27),
|
||||
createToken("divide this sentence into", 7, 32),
|
||||
createToken("this sentence into", 14, 32),
|
||||
createToken("this sentence into shingles", 14, 39),
|
||||
createToken("sentence into shingles", 19, 39),
|
||||
};
|
||||
|
||||
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
|
||||
1, 0, 1, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle"
|
||||
};
|
||||
|
||||
public static final Token[] FOUR_GRAM_TOKENS_MIN_FOUR_GRAM = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("please divide this sentence", 0, 27),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("divide this sentence into", 7, 32),
|
||||
createToken("this", 14, 18),
|
||||
createToken("this sentence into shingles", 14, 39),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("into", 28, 32),
|
||||
createToken("shingles", 33, 39)
|
||||
};
|
||||
|
||||
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM = new int[] {
|
||||
1, 0, 1, 0, 1, 0, 1, 1, 1
|
||||
};
|
||||
|
||||
public static final String[] FOUR_GRAM_TYPES_MIN_FOUR_GRAM = new String[] {
|
||||
"word", "shingle",
|
||||
"word", "shingle",
|
||||
"word", "shingle",
|
||||
"word",
|
||||
"word",
|
||||
"word"
|
||||
};
|
||||
|
||||
public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new Token[] {
|
||||
createToken("please divide this sentence", 0, 27),
|
||||
createToken("divide this sentence into", 7, 32),
|
||||
createToken("this sentence into shingles", 14, 39),
|
||||
};
|
||||
|
||||
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new int[] {
|
||||
1, 1, 1
|
||||
};
|
||||
|
||||
public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new String[] {
|
||||
"shingle",
|
||||
"shingle",
|
||||
"shingle"
|
||||
};
|
||||
|
||||
public static final Token[] BI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("pleasedivide", 0, 13),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("dividethis", 7, 18),
|
||||
createToken("this", 14, 18),
|
||||
createToken("thissentence", 14, 27),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentenceinto", 19, 32),
|
||||
createToken("into", 28, 32),
|
||||
createToken("intoshingles", 28, 39),
|
||||
createToken("shingles", 33, 39),
|
||||
};
|
||||
|
||||
public static final int[] BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] BI_GRAM_TYPES_NO_SEPARATOR = new String[] {
|
||||
"word", "shingle", "word", "shingle", "word", "shingle", "word",
|
||||
"shingle", "word", "shingle", "word"
|
||||
};
|
||||
|
||||
public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
|
||||
createToken("pleasedivide", 0, 13),
|
||||
createToken("dividethis", 7, 18),
|
||||
createToken("thissentence", 14, 27),
|
||||
createToken("sentenceinto", 19, 32),
|
||||
createToken("intoshingles", 28, 39),
|
||||
};
|
||||
|
||||
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
|
||||
1, 1, 1, 1, 1
|
||||
};
|
||||
|
||||
public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
|
||||
"shingle", "shingle", "shingle", "shingle", "shingle"
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("pleasedivide", 0, 13),
|
||||
createToken("pleasedividethis", 0, 18),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("dividethis", 7, 18),
|
||||
createToken("dividethissentence", 7, 27),
|
||||
createToken("this", 14, 18),
|
||||
createToken("thissentence", 14, 27),
|
||||
createToken("thissentenceinto", 14, 32),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentenceinto", 19, 32),
|
||||
createToken("sentenceintoshingles", 19, 39),
|
||||
createToken("into", 28, 32),
|
||||
createToken("intoshingles", 28, 39),
|
||||
createToken("shingles", 33, 39)
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
|
||||
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] TRI_GRAM_TYPES_NO_SEPARATOR = new String[] {
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle",
|
||||
"word"
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
|
||||
createToken("pleasedivide", 0, 13),
|
||||
createToken("pleasedividethis", 0, 18),
|
||||
createToken("dividethis", 7, 18),
|
||||
createToken("dividethissentence", 7, 27),
|
||||
createToken("thissentence", 14, 27),
|
||||
createToken("thissentenceinto", 14, 32),
|
||||
createToken("sentenceinto", 19, 32),
|
||||
createToken("sentenceintoshingles", 19, 39),
|
||||
createToken("intoshingles", 28, 39),
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle",
|
||||
};
|
||||
|
||||
public static final Token[] BI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("please<SEP>divide", 0, 13),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("divide<SEP>this", 7, 18),
|
||||
createToken("this", 14, 18),
|
||||
createToken("this<SEP>sentence", 14, 27),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentence<SEP>into", 19, 32),
|
||||
createToken("into", 28, 32),
|
||||
createToken("into<SEP>shingles", 28, 39),
|
||||
createToken("shingles", 33, 39),
|
||||
};
|
||||
|
||||
public static final int[] BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] BI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
|
||||
"word", "shingle", "word", "shingle", "word", "shingle", "word",
|
||||
"shingle", "word", "shingle", "word"
|
||||
};
|
||||
|
||||
public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
|
||||
createToken("please<SEP>divide", 0, 13),
|
||||
createToken("divide<SEP>this", 7, 18),
|
||||
createToken("this<SEP>sentence", 14, 27),
|
||||
createToken("sentence<SEP>into", 19, 32),
|
||||
createToken("into<SEP>shingles", 28, 39),
|
||||
};
|
||||
|
||||
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
|
||||
1, 1, 1, 1, 1
|
||||
};
|
||||
|
||||
public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
|
||||
"shingle", "shingle", "shingle", "shingle", "shingle"
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("please<SEP>divide", 0, 13),
|
||||
createToken("please<SEP>divide<SEP>this", 0, 18),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("divide<SEP>this", 7, 18),
|
||||
createToken("divide<SEP>this<SEP>sentence", 7, 27),
|
||||
createToken("this", 14, 18),
|
||||
createToken("this<SEP>sentence", 14, 27),
|
||||
createToken("this<SEP>sentence<SEP>into", 14, 32),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentence<SEP>into", 19, 32),
|
||||
createToken("sentence<SEP>into<SEP>shingles", 19, 39),
|
||||
createToken("into", 28, 32),
|
||||
createToken("into<SEP>shingles", 28, 39),
|
||||
createToken("shingles", 33, 39)
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
|
||||
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] TRI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle",
|
||||
"word"
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
|
||||
createToken("please<SEP>divide", 0, 13),
|
||||
createToken("please<SEP>divide<SEP>this", 0, 18),
|
||||
createToken("divide<SEP>this", 7, 18),
|
||||
createToken("divide<SEP>this<SEP>sentence", 7, 27),
|
||||
createToken("this<SEP>sentence", 14, 27),
|
||||
createToken("this<SEP>sentence<SEP>into", 14, 32),
|
||||
createToken("sentence<SEP>into", 19, 32),
|
||||
createToken("sentence<SEP>into<SEP>shingles", 19, 39),
|
||||
createToken("into<SEP>shingles", 28, 39),
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle",
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_NULL_SEPARATOR = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("pleasedivide", 0, 13),
|
||||
createToken("pleasedividethis", 0, 18),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("dividethis", 7, 18),
|
||||
createToken("dividethissentence", 7, 27),
|
||||
createToken("this", 14, 18),
|
||||
createToken("thissentence", 14, 27),
|
||||
createToken("thissentenceinto", 14, 32),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentenceinto", 19, 32),
|
||||
createToken("sentenceintoshingles", 19, 39),
|
||||
createToken("into", 28, 32),
|
||||
createToken("intoshingles", 28, 39),
|
||||
createToken("shingles", 33, 39)
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR = new int[] {
|
||||
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] TRI_GRAM_TYPES_NULL_SEPARATOR = new String[] {
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle",
|
||||
"word"
|
||||
};
|
||||
|
||||
@Override
|
||||
protected void setUp() throws Exception {
|
||||
|
@ -379,6 +732,108 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
|
||||
public void testTriGramFilterMinTriGram() throws IOException {
|
||||
this.shingleFilterTest(3, 3, TEST_TOKEN, TRI_GRAM_TOKENS_MIN_TRI_GRAM,
|
||||
TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
|
||||
TRI_GRAM_TYPES_MIN_TRI_GRAM,
|
||||
true);
|
||||
}
|
||||
|
||||
public void testTriGramFilterWithoutUnigramsMinTriGram() throws IOException {
|
||||
this.shingleFilterTest(3, 3, TEST_TOKEN,
|
||||
TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
|
||||
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
|
||||
TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
|
||||
false);
|
||||
}
|
||||
|
||||
public void testFourGramFilterMinTriGram() throws IOException {
|
||||
this.shingleFilterTest(3, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_TRI_GRAM,
|
||||
FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
|
||||
FOUR_GRAM_TYPES_MIN_TRI_GRAM,
|
||||
true);
|
||||
}
|
||||
|
||||
public void testFourGramFilterWithoutUnigramsMinTriGram() throws IOException {
|
||||
this.shingleFilterTest(3, 4, TEST_TOKEN,
|
||||
FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
|
||||
FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
|
||||
FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, false);
|
||||
}
|
||||
|
||||
public void testFourGramFilterMinFourGram() throws IOException {
|
||||
this.shingleFilterTest(4, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_FOUR_GRAM,
|
||||
FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM,
|
||||
FOUR_GRAM_TYPES_MIN_FOUR_GRAM,
|
||||
true);
|
||||
}
|
||||
|
||||
public void testFourGramFilterWithoutUnigramsMinFourGram() throws IOException {
|
||||
this.shingleFilterTest(4, 4, TEST_TOKEN,
|
||||
FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
|
||||
FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
|
||||
FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM, false);
|
||||
}
|
||||
|
||||
public void testBiGramFilterNoSeparator() throws IOException {
|
||||
this.shingleFilterTest("", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_NO_SEPARATOR,
|
||||
BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
|
||||
BI_GRAM_TYPES_NO_SEPARATOR, true);
|
||||
}
|
||||
|
||||
public void testBiGramFilterWithoutUnigramsNoSeparator() throws IOException {
|
||||
this.shingleFilterTest("", 2, 2, TEST_TOKEN,
|
||||
BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
|
||||
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
|
||||
BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR,
|
||||
false);
|
||||
}
|
||||
public void testTriGramFilterNoSeparator() throws IOException {
|
||||
this.shingleFilterTest("", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NO_SEPARATOR,
|
||||
TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
|
||||
TRI_GRAM_TYPES_NO_SEPARATOR, true);
|
||||
}
|
||||
|
||||
public void testTriGramFilterWithoutUnigramsNoSeparator() throws IOException {
|
||||
this.shingleFilterTest("", 2, 3, TEST_TOKEN,
|
||||
TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
|
||||
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
|
||||
TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR, false);
|
||||
}
|
||||
|
||||
public void testBiGramFilterAltSeparator() throws IOException {
|
||||
this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_ALT_SEPARATOR,
|
||||
BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
|
||||
BI_GRAM_TYPES_ALT_SEPARATOR, true);
|
||||
}
|
||||
|
||||
public void testBiGramFilterWithoutUnigramsAltSeparator() throws IOException {
|
||||
this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN,
|
||||
BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
|
||||
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
|
||||
BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
|
||||
false);
|
||||
}
|
||||
public void testTriGramFilterAltSeparator() throws IOException {
|
||||
this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_ALT_SEPARATOR,
|
||||
TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
|
||||
TRI_GRAM_TYPES_ALT_SEPARATOR, true);
|
||||
}
|
||||
|
||||
public void testTriGramFilterWithoutUnigramsAltSeparator() throws IOException {
|
||||
this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN,
|
||||
TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
|
||||
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
|
||||
TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR, false);
|
||||
}
|
||||
|
||||
public void testTriGramFilterNullSeparator() throws IOException {
|
||||
this.shingleFilterTest(null, 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NULL_SEPARATOR,
|
||||
TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR,
|
||||
TRI_GRAM_TYPES_NULL_SEPARATOR, true);
|
||||
}
|
||||
|
||||
|
||||
public void testReset() throws Exception {
|
||||
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
|
||||
TokenStream filter = new ShingleFilter(wsTokenizer, 2);
|
||||
|
@ -403,30 +858,50 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
throws IOException {
|
||||
|
||||
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
|
||||
shingleFilterTestCommon
|
||||
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
|
||||
}
|
||||
|
||||
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
|
||||
Token[] tokensToCompare, int[] positionIncrements,
|
||||
String[] types, boolean outputUnigrams)
|
||||
throws IOException {
|
||||
ShingleFilter filter
|
||||
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
||||
shingleFilterTestCommon
|
||||
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
|
||||
}
|
||||
|
||||
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
|
||||
Token[] tokensToCompare, int[] positionIncrements,
|
||||
String[] types, boolean outputUnigrams)
|
||||
throws IOException {
|
||||
ShingleFilter filter
|
||||
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
||||
filter.setTokenSeparator(tokenSeparator);
|
||||
shingleFilterTestCommon
|
||||
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
|
||||
}
|
||||
|
||||
protected void shingleFilterTestCommon(ShingleFilter filter,
|
||||
Token[] tokensToCompare,
|
||||
int[] positionIncrements,
|
||||
String[] types, boolean outputUnigrams)
|
||||
throws IOException {
|
||||
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
|
||||
TermAttribute termAtt = filter.addAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = filter.addAttribute(OffsetAttribute.class);
|
||||
PositionIncrementAttribute posIncrAtt = filter.addAttribute(PositionIncrementAttribute.class);
|
||||
TypeAttribute typeAtt = filter.addAttribute(TypeAttribute.class);
|
||||
String text[] = new String[tokensToCompare.length];
|
||||
int startOffsets[] = new int[tokensToCompare.length];
|
||||
int endOffsets[] = new int[tokensToCompare.length];
|
||||
|
||||
int i = 0;
|
||||
while (filter.incrementToken()) {
|
||||
assertTrue("ShingleFilter outputted more tokens than expected", i < tokensToCompare.length);
|
||||
String termText = termAtt.term();
|
||||
String goldText = tokensToCompare[i].term();
|
||||
assertEquals("Wrong termText", goldText, termText);
|
||||
assertEquals("Wrong startOffset for token \"" + termText + "\"",
|
||||
tokensToCompare[i].startOffset(), offsetAtt.startOffset());
|
||||
assertEquals("Wrong endOffset for token \"" + termText + "\"",
|
||||
tokensToCompare[i].endOffset(), offsetAtt.endOffset());
|
||||
assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
|
||||
positionIncrements[i], posIncrAtt.getPositionIncrement());
|
||||
assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
|
||||
i++;
|
||||
for (int i = 0; i < tokensToCompare.length; i++) {
|
||||
text[i] = tokensToCompare[i].term();
|
||||
startOffsets[i] = tokensToCompare[i].startOffset();
|
||||
endOffsets[i] = tokensToCompare[i].endOffset();
|
||||
}
|
||||
assertEquals("ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" + tokensToCompare.length + ")",
|
||||
tokensToCompare.length, i);
|
||||
|
||||
assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset)
|
||||
|
|
Loading…
Reference in New Issue