mirror of https://github.com/apache/lucene.git
LUCENE-1775: Change remaining contrib TokenFilters (shingle, prefix-suffix) to use the new TokenStream API.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@800195 13f79535-47bb-0310-9956-ffa450edef68
parent 5aaf5b0167
commit 457c29d31e
CHANGES.txt
@@ -18,6 +18,12 @@ API Changes
    you are interested in locally and access them on each call to the method that used to pass a new
    Token. Look at the included updated impls for examples. (Mark Miller)

 2. LUCENE-1460: Change contrib TokenStreams/Filters to use the new
    TokenStream API. (Robert Muir, Michael Busch)

+3. LUCENE-1775: Change remaining TokenFilters (shingle, prefix-suffix) to
+   use the new TokenStream API. (Robert Muir, Michael Busch)
+
 Bug fixes

 1. LUCENE-1423: InstantiatedTermEnum#skipTo(Term) throws ArrayIndexOutOfBounds on empty index.
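The consumer-side pattern that CHANGES entry describes looks like this in practice. A minimal sketch against the 2.9-era API (the class name ConsumeExample is illustrative, not part of the commit): fetch each Attribute once up front, then re-read it after every incrementToken() call.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class ConsumeExample {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(new StringReader("hello world"));
    // Fetch the attributes you are interested in once, up front...
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    // ...then re-read them after each call to incrementToken(), instead of
    // receiving a Token from next(reusableToken) as under the old API.
    while (ts.incrementToken()) {
      System.out.println(termAtt.term() + " " + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
    }
  }
}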
PrefixAndSuffixAwareTokenFilter.java
@@ -24,13 +24,16 @@ import java.io.IOException;

 /**
  * Links two PrefixAwareTokenFilter
  * @deprecated
+ * <p/>
+ * <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
+ * the ones located in org.apache.lucene.analysis.tokenattributes.
  */
 public class PrefixAndSuffixAwareTokenFilter extends TokenStream {

   private PrefixAwareTokenFilter suffix;

   public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) {
     super(suffix);
     prefix = new PrefixAwareTokenFilter(prefix, input) {
       public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
         return PrefixAndSuffixAwareTokenFilter.this.updateInputToken(suffixToken, lastInputToken);
@@ -56,11 +59,21 @@ public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
   }

-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    return suffix.next(reusableToken);
+  public final boolean incrementToken() throws IOException {
+    return suffix.incrementToken();
   }

+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
+  }
+
   public void reset() throws IOException {
     suffix.reset();
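For orientation, a hedged usage sketch of the filter above: it brackets a stream with marker tokens, mirroring what TestPrefixAndSuffixAwareTokenFilter (further down in this commit) asserts. PrefixSuffixExample and marker() are illustrative names, not code from the commit.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class PrefixSuffixExample {
  static Token marker(String term, int start, int end) {
    Token t = new Token(start, end);
    t.setTermBuffer(term);
    return t;
  }

  public static void main(String[] args) throws IOException {
    // Bracket the input with "^" and "$" marker tokens.
    TokenStream ts = new PrefixAndSuffixAwareTokenFilter(
        new SingleTokenTokenStream(marker("^", 0, 0)),
        new WhitespaceTokenizer(new StringReader("hello world")),
        new SingleTokenTokenStream(marker("$", 0, 0)));
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(termAtt.term()); // ^, hello, world, $
    }
  }
}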
PrefixAwareTokenFilter.java
@@ -19,6 +19,12 @@ package org.apache.lucene.analysis.miscellaneous;

 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.Payload;

 import java.io.IOException;

@@ -29,28 +35,58 @@ import java.io.IOException;
  * to be used when updating the token values in the second stream based on that token.
  *
  * The default implementation adds last prefix token end offset to the suffix token start and end offsets.
  * @deprecated
+ * <p/>
+ * <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
+ * the ones located in org.apache.lucene.analysis.tokenattributes.
  */
 public class PrefixAwareTokenFilter extends TokenStream {

   private TokenStream prefix;
   private TokenStream suffix;

+  private TermAttribute termAtt;
+  private PositionIncrementAttribute posIncrAtt;
+  private PayloadAttribute payloadAtt;
+  private OffsetAttribute offsetAtt;
+  private TypeAttribute typeAtt;
+  private FlagsAttribute flagsAtt;
+
+  private TermAttribute p_termAtt;
+  private PositionIncrementAttribute p_posIncrAtt;
+  private PayloadAttribute p_payloadAtt;
+  private OffsetAttribute p_offsetAtt;
+  private TypeAttribute p_typeAtt;
+  private FlagsAttribute p_flagsAtt;

   public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
     super(suffix);
     this.suffix = suffix;
     this.prefix = prefix;
     prefixExhausted = false;
+
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+
+    p_termAtt = (TermAttribute) prefix.addAttribute(TermAttribute.class);
+    p_posIncrAtt = (PositionIncrementAttribute) prefix.addAttribute(PositionIncrementAttribute.class);
+    p_payloadAtt = (PayloadAttribute) prefix.addAttribute(PayloadAttribute.class);
+    p_offsetAtt = (OffsetAttribute) prefix.addAttribute(OffsetAttribute.class);
+    p_typeAtt = (TypeAttribute) prefix.addAttribute(TypeAttribute.class);
+    p_flagsAtt = (FlagsAttribute) prefix.addAttribute(FlagsAttribute.class);
   }

   private Token previousPrefixToken = new Token();
+  private Token reusableToken = new Token();

   private boolean prefixExhausted;

-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-
+  public final boolean incrementToken() throws IOException {
     if (!prefixExhausted) {
-      Token nextToken = prefix.next(reusableToken);
+      Token nextToken = getNextPrefixInputToken(reusableToken);
       if (nextToken == null) {
         prefixExhausted = true;
       } else {
@@ -60,16 +96,63 @@ public class PrefixAwareTokenFilter extends TokenStream {
         if (p != null) {
           previousPrefixToken.setPayload((Payload) p.clone());
         }
-        return nextToken;
+        setCurrentToken(nextToken);
+        return true;
       }
     }

-    Token nextToken = suffix.next(reusableToken);
+    Token nextToken = getNextSuffixInputToken(reusableToken);
     if (nextToken == null) {
-      return null;
+      return false;
     }

-    return updateSuffixToken(nextToken, previousPrefixToken);
+    nextToken = updateSuffixToken(nextToken, previousPrefixToken);
+    setCurrentToken(nextToken);
+    return true;
   }

+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
+  }
+
+  private void setCurrentToken(Token token) {
+    if (token == null) return;
+    termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
+    posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+    flagsAtt.setFlags(token.getFlags());
+    offsetAtt.setOffset(token.startOffset(), token.endOffset());
+    typeAtt.setType(token.type());
+    payloadAtt.setPayload(token.getPayload());
+  }
+
+  private Token getNextPrefixInputToken(Token token) throws IOException {
+    if (!prefix.incrementToken()) return null;
+    token.setTermBuffer(p_termAtt.termBuffer(), 0, p_termAtt.termLength());
+    token.setPositionIncrement(p_posIncrAtt.getPositionIncrement());
+    token.setFlags(p_flagsAtt.getFlags());
+    token.setOffset(p_offsetAtt.startOffset(), p_offsetAtt.endOffset());
+    token.setType(p_typeAtt.type());
+    token.setPayload(p_payloadAtt.getPayload());
+    return token;
+  }
+
+  private Token getNextSuffixInputToken(Token token) throws IOException {
+    if (!suffix.incrementToken()) return null;
+    token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+    token.setPositionIncrement(posIncrAtt.getPositionIncrement());
+    token.setFlags(flagsAtt.getFlags());
+    token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
+    token.setType(typeAtt.type());
+    token.setPayload(payloadAtt.getPayload());
+    return token;
+  }

   /**
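A hedged usage sketch of the filter above, mirroring the first assertions in TestPrefixAwareTokenFilter later in this commit (PrefixAwareExample is an illustrative name): the default updateSuffixToken shifts the suffix stream's offsets past the last prefix token.

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.PrefixAwareTokenFilter;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class PrefixAwareExample {
  public static void main(String[] args) throws IOException {
    Token a = new Token(0, 1); a.setTermBuffer("a");
    Token b = new Token(0, 1); b.setTermBuffer("b");
    // "a" keeps offsets 0-1; "b" is shifted to 1-2, as the test asserts.
    TokenStream ts = new PrefixAwareTokenFilter(
        new SingleTokenTokenStream(a), new SingleTokenTokenStream(b));
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(termAtt.term() + " " + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
    }
  }
}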
ShingleFilter.java
@@ -18,12 +18,17 @@ package org.apache.lucene.analysis.shingle;
  */

 import java.io.IOException;
-import java.util.LinkedList;
+import java.util.Iterator;
+import java.util.LinkedList;

-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;

 /**
  * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
@@ -39,8 +44,6 @@ import org.apache.lucene.analysis.Token;
 public class ShingleFilter extends TokenFilter {

   private LinkedList shingleBuf = new LinkedList();
-  private LinkedList outputBuf = new LinkedList();
-  private LinkedList tokenBuf = new LinkedList();
   private StringBuffer[] shingles;
   private String tokenType = "shingle";

@@ -81,6 +84,11 @@ public class ShingleFilter extends TokenFilter {
   public ShingleFilter(TokenStream input, int maxShingleSize) {
     super(input);
     setMaxShingleSize(maxShingleSize);
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+
   }

   /**
@@ -148,23 +156,69 @@ public class ShingleFilter extends TokenFilter {
       shingles[i].setLength(0);
     }
   }

+  private AttributeSource.State nextToken;
+  private int shingleBufferPosition;
+  private int[] endOffsets;
+
   /* (non-Javadoc)
    * @see org.apache.lucene.analysis.TokenStream#next()
    */
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    if (outputBuf.isEmpty()) {
-      fillOutputBuf(reusableToken);
+  public final boolean incrementToken() throws IOException {
+    while (true) {
+      if (nextToken == null) {
+        if (!fillShingleBuffer()) {
+          return false;
+        }
+      }
+
+      nextToken = (AttributeSource.State) shingleBuf.getFirst();
+
+      if (shingleBufferPosition == 0 && (! shingleBuf.isEmpty()) && outputUnigrams) {
+        restoreState(nextToken);
+        posIncrAtt.setPositionIncrement(1);
+        shingleBufferPosition++;
+        return true;
+      }
+
+      if (shingleBufferPosition < shingleBuf.size()) {
+        restoreState(nextToken);
+        typeAtt.setType(tokenType);
+        offsetAtt.setOffset(offsetAtt.startOffset(), endOffsets[shingleBufferPosition]);
+        StringBuffer buf = shingles[shingleBufferPosition];
+        int termLength = buf.length();
+        char[] termBuffer = termAtt.termBuffer();
+        if (termBuffer.length < termLength)
+          termBuffer = termAtt.resizeTermBuffer(termLength);
+        buf.getChars(0, termLength, termBuffer, 0);
+        termAtt.setTermLength(termLength);
+        if ((! outputUnigrams) && shingleBufferPosition == 1) {
+          posIncrAtt.setPositionIncrement(1);
+        } else {
+          posIncrAtt.setPositionIncrement(0);
+        }
+        shingleBufferPosition++;
+        if (shingleBufferPosition == shingleBuf.size()) {
+          nextToken = null;
+          shingleBufferPosition = 0;
+        }
+        return true;
+      } else {
+        nextToken = null;
+        shingleBufferPosition = 0;
+      }
+    }
-    Token nextToken = null;
-    if ( ! outputBuf.isEmpty())
-    {
-      nextToken = (Token)outputBuf.remove(0);
-    }
-    return nextToken;
   }

+  private int numFillerTokensToInsert;
+  private AttributeSource.State currentToken;
+  private boolean hasCurrentToken;
+
+  private TermAttribute termAtt;
+  private OffsetAttribute offsetAtt;
+  private PositionIncrementAttribute posIncrAtt;
+  private TypeAttribute typeAtt;
+
   /**
    * Get the next token from the input stream and push it on the token buffer.
    * If we encounter a token with position increment > 1, we put filler tokens
@@ -174,41 +228,53 @@ public class ShingleFilter extends TokenFilter {
    * @return the next token, or null if at end of input stream
    * @throws IOException if the input stream has a problem
    */
-  private Token getNextToken(final Token reusableToken) throws IOException {
-    if (tokenBuf.isEmpty()) {
-      Token nextToken = input.next(reusableToken);
-      if (nextToken != null) {
-        for (int i = 1; i < nextToken.getPositionIncrement(); i++) {
-          Token fillerToken = (Token) nextToken.clone();
-          // A filler token occupies no space
-          fillerToken.setEndOffset(fillerToken.startOffset());
-          fillerToken.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
-          tokenBuf.add(fillerToken);
-        }
-        tokenBuf.add(nextToken.clone());
-        return getNextToken(nextToken);
-      } else {
-        return null;
-      }
-    } else {
-      return (Token)tokenBuf.remove(0);
+  private boolean getNextToken() throws IOException {
+
+    while (true) {
+      if (numFillerTokensToInsert > 0) {
+        if (currentToken == null) {
+          currentToken = captureState();
+        } else {
+          restoreState(currentToken);
+        }
+        numFillerTokensToInsert--;
+        // A filler token occupies no space
+        offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
+        termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+        return true;
+      }
+
+      if (hasCurrentToken) {
+        if (currentToken != null) {
+          restoreState(currentToken);
+          currentToken = null;
+        }
+        hasCurrentToken = false;
+        return true;
+      }
+
+      if (!input.incrementToken()) return false;
+      hasCurrentToken = true;
+
+      if (posIncrAtt.getPositionIncrement() > 1) {
+        numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1;
+      }
     }
   }

   /**
    * Fill the output buffer with new shingles.
    *
    * @throws IOException if there's a problem getting the next token
    */
-  private void fillOutputBuf(Token token) throws IOException {
+  private boolean fillShingleBuffer() throws IOException {
     boolean addedToken = false;
     /*
      * Try to fill the shingle buffer.
      */
     do {
-      token = getNextToken(token);
-      if (token != null) {
-        shingleBuf.add(token.clone());
+      if (getNextToken()) {
+        shingleBuf.add(captureState());
         if (shingleBuf.size() > maxShingleSize)
         {
           shingleBuf.remove(0);
@@ -219,69 +285,55 @@ public class ShingleFilter extends TokenFilter {
       }
     } while (shingleBuf.size() < maxShingleSize);

+    if (shingleBuf.isEmpty()) {
+      return false;
+    }
+
     /*
      * If no new token could be added to the shingle buffer, we have reached
      * the end of the input stream and have to discard the least recent token.
      */
     if (! addedToken) {
-      if (shingleBuf.isEmpty()) {
-        return;
-      } else {
-        shingleBuf.remove(0);
-      }
+      shingleBuf.remove(0);
     }

+    if (shingleBuf.isEmpty()) {
+      return false;
+    }
+
     clearShingles();

-    int[] endOffsets = new int[shingleBuf.size()];
+    endOffsets = new int[shingleBuf.size()];
     for (int i = 0; i < endOffsets.length; i++) {
       endOffsets[i] = 0;
     }

     int i = 0;
-    Token shingle = null;
     for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
-      shingle = (Token) it.next();
+      restoreState((AttributeSource.State) it.next());
       for (int j = i; j < shingles.length; j++) {
         if (shingles[j].length() != 0) {
           shingles[j].append(TOKEN_SEPARATOR);
         }
-        shingles[j].append(shingle.termBuffer(), 0, shingle.termLength());
+        shingles[j].append(termAtt.termBuffer(), 0, termAtt.termLength());
       }

-      endOffsets[i] = shingle.endOffset();
+      endOffsets[i] = offsetAtt.endOffset();
       i++;
     }

+    return true;
   }

+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }

-    if ((! shingleBuf.isEmpty()) && outputUnigrams) {
-      Token unigram = (Token) shingleBuf.getFirst();
-      unigram.setPositionIncrement(1);
-      outputBuf.add(unigram);
-    }
-
-    /*
-     * Push new tokens to the output buffer.
-     */
-    if (!shingleBuf.isEmpty()) {
-      Token firstShingle = (Token) shingleBuf.get(0);
-      shingle = (Token) firstShingle.clone();
-      shingle.setType(tokenType);
-    }
-    for (int j = 1; j < shingleBuf.size(); j++) {
-      shingle.setEndOffset(endOffsets[j]);
-      StringBuffer buf = shingles[j];
-      int termLength = buf.length();
-      char[] termBuffer = shingle.termBuffer();
-      if (termBuffer.length < termLength)
-        termBuffer = shingle.resizeTermBuffer(termLength);
-      buf.getChars(0, termLength, termBuffer, 0);
-      shingle.setTermLength(termLength);
-      if ((! outputUnigrams) && j == 1) {
-        shingle.setPositionIncrement(1);
-      } else {
-        shingle.setPositionIncrement(0);
-      }
-      outputBuf.add(shingle.clone());
-    }
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
+  }
 }
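A hedged usage sketch of the converted ShingleFilter (ShingleExample is an illustrative name; the constructor and the "shingle" token type are taken from the diff above):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class ShingleExample {
  public static void main(String[] args) throws IOException {
    // Bi-gram shingles over a whitespace-tokenized sentence.
    TokenStream ts = new ShingleFilter(
        new WhitespaceTokenizer(new StringReader("please divide this sentence")), 2);
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute) ts.addAttribute(TypeAttribute.class);
    while (ts.incrementToken()) {
      // Unigrams keep their original type; shingles come out typed "shingle":
      // please, "please divide", divide, "divide this", this, ...
      System.out.println(termAtt.term() + " [" + typeAtt.type() + "]");
    }
  }
}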
ShingleMatrixFilter.java
@@ -30,6 +30,12 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
 import org.apache.lucene.analysis.payloads.PayloadHelper;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.Payload;


@@ -104,6 +110,9 @@ import org.apache.lucene.index.Payload;
  * <p>The filter also has basic support for calculating weights for the shingles
  * based on the weights of the tokens from the input stream, output shingle size, et c.
  * See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}.
+ * <p/>
+ * <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
+ * the ones located in org.apache.lucene.analysis.tokenattributes.
  */
 public class ShingleMatrixFilter extends TokenStream {

@@ -183,7 +192,21 @@ public class ShingleMatrixFilter extends TokenStream {

   private TokenStream input;

+  private TermAttribute termAtt;
+  private PositionIncrementAttribute posIncrAtt;
+  private PayloadAttribute payloadAtt;
+  private OffsetAttribute offsetAtt;
+  private TypeAttribute typeAtt;
+  private FlagsAttribute flagsAtt;
+
+  private TermAttribute in_termAtt;
+  private PositionIncrementAttribute in_posIncrAtt;
+  private PayloadAttribute in_payloadAtt;
+  private OffsetAttribute in_offsetAtt;
+  private TypeAttribute in_typeAtt;
+  private FlagsAttribute in_flagsAtt;
+

   /**
    * Creates a shingle filter based on a user defined matrix.
    *
@@ -205,8 +228,22 @@ public class ShingleMatrixFilter extends TokenStream {
     this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
     this.settingsCodec = settingsCodec;

+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+
     // set the input to be an empty token stream, we already have the data.
     this.input = new EmptyTokenStream();
+
+    in_termAtt = (TermAttribute) input.addAttribute(TermAttribute.class);
+    in_posIncrAtt = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
+    in_payloadAtt = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
+    in_offsetAtt = (OffsetAttribute) input.addAttribute(OffsetAttribute.class);
+    in_typeAtt = (TypeAttribute) input.addAttribute(TypeAttribute.class);
+    in_flagsAtt = (FlagsAttribute) input.addAttribute(FlagsAttribute.class);
   }

   /**
@@ -273,6 +310,19 @@ public class ShingleMatrixFilter extends TokenStream {
     this.spacerCharacter = spacerCharacter;
     this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
     this.settingsCodec = settingsCodec;
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+
+    in_termAtt = (TermAttribute) input.addAttribute(TermAttribute.class);
+    in_posIncrAtt = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
+    in_payloadAtt = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
+    in_offsetAtt = (OffsetAttribute) input.addAttribute(OffsetAttribute.class);
+    in_typeAtt = (TypeAttribute) input.addAttribute(TypeAttribute.class);
+    in_flagsAtt = (FlagsAttribute) input.addAttribute(FlagsAttribute.class);
   }

   // internal filter instance variables
@@ -302,10 +352,10 @@ public class ShingleMatrixFilter extends TokenStream {
   }

   private Matrix matrix;

-
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;

+  private Token reusableToken = new Token();
+
+  public final boolean incrementToken() throws IOException {
     if (matrix == null) {
       matrix = new Matrix();
       // fill matrix with maximumShingleSize columns
@@ -321,9 +371,39 @@ public class ShingleMatrixFilter extends TokenStream {
     do {
       token = produceNextToken(reusableToken);
     } while (token == request_next_token);
+    if (token == null) return false;
+
+    termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
+    posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+    flagsAtt.setFlags(token.getFlags());
+    offsetAtt.setOffset(token.startOffset(), token.endOffset());
+    typeAtt.setType(token.type());
+    payloadAtt.setPayload(token.getPayload());
+    return true;
   }

+  private Token getNextInputToken(Token token) throws IOException {
+    if (!input.incrementToken()) return null;
+    token.setTermBuffer(in_termAtt.termBuffer(), 0, in_termAtt.termLength());
+    token.setPositionIncrement(in_posIncrAtt.getPositionIncrement());
+    token.setFlags(in_flagsAtt.getFlags());
+    token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset());
+    token.setType(in_typeAtt.type());
+    token.setPayload(in_payloadAtt.getPayload());
+    return token;
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
+  }
+
   private static final Token request_next_token = new Token();

@@ -573,7 +653,7 @@ public class ShingleMatrixFilter extends TokenStream {
       token = readColumnBuf;
       readColumnBuf = null;
     } else {
-      token = input.next(new Token());
+      token = getNextInputToken(new Token());
     }

     if (token == null) {
@@ -585,7 +665,7 @@ public class ShingleMatrixFilter extends TokenStream {

     currentReaderRow.getTokens().add(token);
     TokenPositioner tokenPositioner;
-    while ((readColumnBuf = input.next(new Token())) != null
+    while ((readColumnBuf = getNextInputToken(new Token())) != null
         && (tokenPositioner = settingsCodec.getTokenPositioner(readColumnBuf)) != TokenPositioner.newColumn) {

       if (tokenPositioner == TokenPositioner.sameRow) {
@@ -599,7 +679,7 @@ public class ShingleMatrixFilter extends TokenStream {
     }

     if (readColumnBuf == null) {
-      readColumnBuf = input.next(new Token());
+      readColumnBuf = getNextInputToken(new Token());
       if (readColumnBuf == null) {
         currentReaderColumn.setLast(true);
       }
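ShingleMatrixFilter keeps its Token-based internals (produceNextToken and the Matrix machinery) and converts only at the edges: getNextInputToken copies the input stream's attribute values into a reusable Token, and incrementToken copies the produced Token back out into the filter's own attributes. In isolation, that bridge idiom looks roughly like this — TokenBridge and nextAsToken are illustrative names, not code from the commit:

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

class TokenBridge {
  // Pull the wrapped stream's next token as an old-style Token, or null at end.
  static Token nextAsToken(TokenStream input, Token reusable) throws IOException {
    TermAttribute termAtt = (TermAttribute) input.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) input.addAttribute(OffsetAttribute.class);
    if (!input.incrementToken()) return null;
    reusable.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
    reusable.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
    return reusable;
  }
}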
TestPrefixAndSuffixAwareTokenFilter.java
@@ -21,6 +21,8 @@ import junit.framework.TestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 import java.io.IOException;
 import java.io.StringReader;

@@ -34,22 +36,22 @@ public class TestPrefixAndSuffixAwareTokenFilter extends TestCase {
         new WhitespaceTokenizer(new StringReader("hello world")),
         new SingleTokenTokenStream(createToken("$", 0, 0)));

-    Token token = new Token();
-    assertNext(ts, token, "^", 0, 0);
-    assertNext(ts, token, "hello", 0, 5);
-    assertNext(ts, token, "world", 6, 11);
-    assertNext(ts, token, "$", 11, 11);
-    assertNull(ts.next(token));
+    assertNext(ts, "^", 0, 0);
+    assertNext(ts, "hello", 0, 5);
+    assertNext(ts, "world", 6, 11);
+    assertNext(ts, "$", 11, 11);
+    assertFalse(ts.incrementToken());
   }


-  private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
-    Token nextToken = ts.next(reusableToken);
-    assertNotNull(nextToken);
-    assertEquals(text, nextToken.term());
-    assertEquals(startOffset, nextToken.startOffset());
-    assertEquals(endOffset, nextToken.endOffset());
-    return nextToken;
+  private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
+    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
+
+    assertTrue(ts.incrementToken());
+    assertEquals(text, termAtt.term());
+    assertEquals(startOffset, offsetAtt.startOffset());
+    assertEquals(endOffset, offsetAtt.endOffset());
   }

   private static Token createToken(String term, int start, int offset)
TestPrefixAwareTokenFilter.java
@@ -21,6 +21,8 @@ import junit.framework.TestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 import java.io.IOException;
 import java.io.StringReader;

@@ -34,32 +36,31 @@ public class TestPrefixAwareTokenFilter extends TestCase {
     ts = new PrefixAwareTokenFilter(
         new SingleTokenTokenStream(createToken("a", 0, 1)),
         new SingleTokenTokenStream(createToken("b", 0, 1)));
-    final Token reusableToken = new Token();
-    assertNext(ts, reusableToken, "a", 0, 1);
-    assertNext(ts, reusableToken, "b", 1, 2);
-    assertNull(ts.next(reusableToken));
+
+    assertNext(ts, "a", 0, 1);
+    assertNext(ts, "b", 1, 2);
+    assertFalse(ts.incrementToken());

     // prefix and suffix using 2x prefix

     ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)), new WhitespaceTokenizer(new StringReader("hello world")));
     ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(createToken("$", 0, 0)));

-    assertNext(ts, reusableToken, "^", 0, 0);
-    assertNext(ts, reusableToken, "hello", 0, 5);
-    assertNext(ts, reusableToken, "world", 6, 11);
-    assertNext(ts, reusableToken, "$", 11, 11);
-    assertNull(ts.next(reusableToken));
+    assertNext(ts, "^", 0, 0);
+    assertNext(ts, "hello", 0, 5);
+    assertNext(ts, "world", 6, 11);
+    assertNext(ts, "$", 11, 11);
+    assertFalse(ts.incrementToken());
   }


-  private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
-    Token nextToken = ts.next(reusableToken);
-    assertNotNull(nextToken);
-    assertEquals(text, nextToken.term());
-    assertEquals(startOffset, nextToken.startOffset());
-    assertEquals(endOffset, nextToken.endOffset());
-    return nextToken;
+  private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
+    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
+
+    assertTrue(ts.incrementToken());
+    assertEquals(text, termAtt.term());
+    assertEquals(startOffset, offsetAtt.startOffset());
+    assertEquals(endOffset, offsetAtt.endOffset());
   }

   private static Token createToken(String term, int start, int offset)
ShingleAnalyzerWrapperTest.java
@@ -23,6 +23,8 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;

@@ -157,10 +159,13 @@ public class ShingleAnalyzerWrapperTest extends TestCase {
     TokenStream ts = analyzer.tokenStream("content",
                                           new StringReader("this sentence"));
     int j = -1;
-    final Token reusableToken = new Token();
-    for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
-      j += nextToken.getPositionIncrement();
-      String termText = nextToken.term();
+
+    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
+    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
+    while (ts.incrementToken()) {
+      j += posIncrAtt.getPositionIncrement();
+      String termText = termAtt.term();
       q.add(new Term("content", termText), j);
     }

@@ -182,9 +187,11 @@ public class ShingleAnalyzerWrapperTest extends TestCase {

     TokenStream ts = analyzer.tokenStream("content",
                                           new StringReader("test sentence"));
-    final Token reusableToken = new Token();
-    for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
-      String termText = nextToken.term();
+
+    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
+    while (ts.incrementToken()) {
+      String termText = termAtt.term();
       q.add(new TermQuery(new Term("content", termText)),
             BooleanClause.Occur.SHOULD);
     }
ShingleFilterTest.java
@@ -22,6 +22,11 @@ import java.io.IOException;
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttributeImpl;

 public class ShingleFilterTest extends TestCase {

@@ -29,18 +34,31 @@ public class ShingleFilterTest extends TestCase {

     protected int index = 0;
     protected Token[] testToken;

+    private TermAttribute termAtt;
+    private OffsetAttribute offsetAtt;
+    private PositionIncrementAttribute posIncrAtt;
+    private TypeAttribute typeAtt;
+
     public TestTokenStream(Token[] testToken) {
       super();
       this.testToken = testToken;
+      this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+      this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+      this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
     }

-    public Token next(final Token reusableToken) throws IOException {
-      assert reusableToken != null;
+    public final boolean incrementToken() throws IOException {
       if (index < testToken.length) {
-        return testToken[index++];
+        Token t = testToken[index++];
+        termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
+        offsetAtt.setOffset(t.startOffset(), t.endOffset());
+        posIncrAtt.setPositionIncrement(t.getPositionIncrement());
+        typeAtt.setType(TypeAttributeImpl.DEFAULT_TYPE);
+        return true;
       } else {
-        return null;
+        return false;
       }
     }
   }
@@ -163,25 +181,29 @@ public class ShingleFilterTest extends TestCase {
     this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
                            TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
   }


   protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
                                    int[] positionIncrements, String[] types)
     throws IOException {

     TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+    TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) filter.addAttribute(OffsetAttribute.class);
+    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) filter.addAttribute(PositionIncrementAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) filter.addAttribute(TypeAttribute.class);
+
     int i = 0;
-    final Token reusableToken = new Token();
-    for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
-      String termText = nextToken.term();
+    while (filter.incrementToken()) {
+      String termText = termAtt.term();
       String goldText = tokensToCompare[i].term();
       assertEquals("Wrong termText", goldText, termText);
       assertEquals("Wrong startOffset for token \"" + termText + "\"",
-          tokensToCompare[i].startOffset(), nextToken.startOffset());
+          tokensToCompare[i].startOffset(), offsetAtt.startOffset());
       assertEquals("Wrong endOffset for token \"" + termText + "\"",
-          tokensToCompare[i].endOffset(), nextToken.endOffset());
+          tokensToCompare[i].endOffset(), offsetAtt.endOffset());
       assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
-          positionIncrements[i], nextToken.getPositionIncrement());
-      assertEquals("Wrong type for token \"" + termText + "\"", types[i], nextToken.type());
+          positionIncrements[i], posIncrAtt.getPositionIncrement());
+      assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
       i++;
     }
   }
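In the spirit of the TestTokenStream conversion above, this is about the smallest useful token source under the new API — a minimal sketch, with StringArrayTokenStream as an illustrative name rather than a class from the commit:

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

class StringArrayTokenStream extends TokenStream {
  private final String[] terms;
  private int index = 0;
  private final TermAttribute termAtt;

  StringArrayTokenStream(String[] terms) {
    this.terms = terms;
    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    if (index >= terms.length) return false; // exhausted
    termAtt.setTermBuffer(terms[index++]);   // expose state via attributes
    return true;
  }
}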
@ -17,22 +17,29 @@ package org.apache.lucene.analysis.shingle;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
|
||||
import org.apache.lucene.analysis.payloads.PayloadHelper;
|
||||
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
|
||||
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.CachingTokenFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
|
||||
import org.apache.lucene.analysis.payloads.PayloadHelper;
|
||||
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
|
||||
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
public class TestShingleMatrixFilter extends TestCase {
|
||||
|
||||
|
||||
|
@ -43,7 +50,7 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
TokenStream ts;
|
||||
|
||||
ts = new ShingleMatrixFilter(new EmptyTokenStream(), 1, 2, new Character(' '), false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
|
||||
assertNull(ts.next(new Token()));
|
||||
assertFalse(ts.incrementToken());
|
||||
|
||||
TokenListStream tls;
|
||||
LinkedList tokens;
|
||||
|
@ -66,20 +73,20 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
|
||||
Token reusableToken = new Token();
|
||||
|
||||
assertNext(ts, reusableToken, "please", 0, 6);
|
||||
assertNext(ts, reusableToken, "please divide", 0, 13);
|
||||
assertNext(ts, reusableToken, "divide", 7, 13);
|
||||
assertNext(ts, reusableToken, "divide this", 7, 18);
|
||||
assertNext(ts, reusableToken, "this", 14, 18);
|
||||
assertNext(ts, reusableToken, "this sentence", 14, 27);
|
||||
assertNext(ts, reusableToken, "sentence", 19, 27);
|
||||
assertNext(ts, reusableToken, "sentence into", 19, 32);
|
||||
assertNext(ts, reusableToken, "into", 28, 32);
|
||||
assertNext(ts, reusableToken, "into shingles", 28, 39);
|
||||
assertNext(ts, reusableToken, "shingles", 33, 39);
|
||||
assertNext(ts, "please", 0, 6);
|
||||
assertNext(ts, "please divide", 0, 13);
|
||||
assertNext(ts, "divide", 7, 13);
|
||||
assertNext(ts, "divide this", 7, 18);
|
||||
assertNext(ts, "this", 14, 18);
|
||||
assertNext(ts, "this sentence", 14, 27);
|
||||
assertNext(ts, "sentence", 19, 27);
|
||||
assertNext(ts, "sentence into", 19, 32);
|
||||
assertNext(ts, "into", 28, 32);
|
||||
assertNext(ts, "into shingles", 28, 39);
|
||||
assertNext(ts, "shingles", 33, 39);
|
||||
|
||||
|
||||
assertNull(ts.next(reusableToken));
|
||||
assertFalse(ts.incrementToken());
|
||||
|
||||
}
|
||||
|
||||
|
@ -92,7 +99,7 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
ShingleMatrixFilter.defaultSettingsCodec = null;//new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
|
||||
|
||||
TokenStream ts;
|
||||
TokenListStream tls;
|
||||
TokenStream tls;
|
||||
LinkedList tokens;
|
||||
|
||||
// test a plain old token stream with synonyms tranlated to rows.
|
||||
|
@ -111,25 +118,25 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
ts = new ShingleMatrixFilter(tls, 2, 2, new Character('_'), false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
|
||||
|
||||
final Token reusableToken = new Token();
|
||||
assertNext(ts, reusableToken, "hello_world");
|
||||
assertNext(ts, reusableToken, "greetings_world");
|
||||
assertNext(ts, reusableToken, "hello_earth");
|
||||
assertNext(ts, reusableToken, "greetings_earth");
|
||||
assertNext(ts, reusableToken, "hello_tellus");
|
||||
assertNext(ts, reusableToken, "greetings_tellus");
|
||||
assertNull(ts.next(reusableToken));
|
||||
assertNext(ts, "hello_world");
|
||||
assertNext(ts, "greetings_world");
|
||||
assertNext(ts, "hello_earth");
|
||||
assertNext(ts, "greetings_earth");
|
||||
assertNext(ts, "hello_tellus");
|
||||
assertNext(ts, "greetings_tellus");
|
||||
assertFalse(ts.incrementToken());
|
||||
|
||||
// bi-grams with no spacer character, start offset, end offset
|
||||
|
||||
tls.reset();
|
||||
ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
|
||||
assertNext(ts, reusableToken, "helloworld", 0, 10);
|
||||
assertNext(ts, reusableToken, "greetingsworld", 0, 10);
|
||||
assertNext(ts, reusableToken, "helloearth", 0, 10);
|
||||
assertNext(ts, reusableToken, "greetingsearth", 0, 10);
|
||||
assertNext(ts, reusableToken, "hellotellus", 0, 10);
|
||||
assertNext(ts, reusableToken, "greetingstellus", 0, 10);
|
||||
assertNull(ts.next(reusableToken));
|
||||
assertNext(ts, "helloworld", 0, 10);
|
||||
assertNext(ts, "greetingsworld", 0, 10);
|
||||
assertNext(ts, "helloearth", 0, 10);
|
||||
assertNext(ts, "greetingsearth", 0, 10);
|
||||
assertNext(ts, "hellotellus", 0, 10);
|
||||
assertNext(ts, "greetingstellus", 0, 10);
|
||||
assertFalse(ts.incrementToken());
|
||||
|
||||
|
||||
// add ^_prefix_and_suffix_$
|
||||
|
@ -148,7 +155,7 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
tls = new TokenListStream(tokens);
|
||||
|
||||
ts = new PrefixAndSuffixAwareTokenFilter(new SingleTokenTokenStream(tokenFactory("^", 1, 100f, 0, 0)), tls, new SingleTokenTokenStream(tokenFactory("$", 1, 50f, 0, 0)));
|
||||
tls = new TokenListStream(ts);
|
||||
tls = new CachingTokenFilter(ts);
|
||||
|
||||
// bi-grams, position incrememnt, weight, start offset, end offset
|
||||
|
||||
|
@ -159,18 +166,18 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
// token.clear();
|
||||
// }
|
||||
|
||||
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertNull(ts.next(reusableToken));
|
||||
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertFalse(ts.incrementToken());
|
||||
|
||||
// test unlimited size and allow single boundary token as shingle
|
||||
tls.reset();
|
||||
|
@ -182,44 +189,44 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
// token.clear();
|
||||
// }
|
||||
|
||||
assertNext(ts, reusableToken, "^", 1, 10.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "^_hello_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "world", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "$", 1, 7.071068f, 10, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "^_greetings_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "earth", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "tellus", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "^", 1, 10.0f, 0, 0);
|
||||
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "world", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "$", 1, 7.071068f, 10, 10);
|
||||
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "earth", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "tellus", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
|
||||
assertNull(ts.next(reusableToken));
|
||||
assertFalse(ts.incrementToken());
|
||||
|
||||
// test unlimited size but don't allow single boundary token as shingle
|
||||
|
||||
|
@ -230,43 +237,43 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
// token.clear();
|
||||
// }
|
||||
|
||||
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "^_hello_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "world", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "^_greetings_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "earth", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "tellus", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "world", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "earth", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "tellus", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
|
||||
|
||||
assertNull(ts.next(reusableToken));
|
||||
assertFalse(ts.incrementToken());
|
||||
|
||||
System.currentTimeMillis();
|
||||
|
||||
|
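Every assert pair in the hunk above follows the same mechanical migration: the deprecated next(Token) call, which returned a filled-in reusable Token or null at end of stream, becomes incrementToken(), which returns a boolean and publishes token state through attributes. A minimal side-by-side sketch of the two consumption loops, assuming `stream` is any TokenStream (the variable names are illustrative, not taken from this patch):

  // Old API: a caller-supplied Token is filled in and returned; null signals end of stream.
  final Token reusableToken = new Token();
  for (Token t = stream.next(reusableToken); t != null; t = stream.next(reusableToken)) {
    System.out.println(t.term());
  }

  // New API: fetch the attribute once, then advance with incrementToken();
  // the same TermAttribute instance is updated in place on every call.
  TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
  while (stream.incrementToken()) {
    System.out.println(termAtt.term());
  }

The same asymmetry explains the end-of-stream checks in the hunks: assertNull(ts.next(reusableToken)) becomes assertFalse(ts.incrementToken()).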
@ -301,20 +308,20 @@ public class TestShingleMatrixFilter extends TestCase {

// shingle, position increment, weight, start offset, end offset

assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_and", 1, 1.4142135f, 0, 4);
assertNext(ts, reusableToken, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
assertNext(ts, reusableToken, "and_salutations", 1, 1.4142135f, 0, 4);
assertNext(ts, reusableToken, "and_salutations_world", 1, 1.7320508f, 0, 10);
assertNext(ts, reusableToken, "salutations_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "and_salutations_earth", 1, 1.7320508f, 0, 10);
assertNext(ts, reusableToken, "salutations_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
assertNext(ts, reusableToken, "salutations_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);

assertNull(ts.next(reusableToken));
assertFalse(ts.incrementToken());

System.currentTimeMillis();
@ -361,47 +368,47 @@ public class TestShingleMatrixFilter extends TestCase {

// }

final Token reusableToken = new Token();
assertNext(ts, reusableToken, "no_surprise", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "no_surprise_to", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "no_surprise_to_see", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "surprise_to", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "surprise_to_see", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "surprise_to_see_england", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "to_see", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "to_see_england", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "to_see_england_manager", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "see_england", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "see_england_manager", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "see_england_manager_svennis", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "england_manager", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "england_manager_svennis", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "england_manager_svennis_in", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "manager_svennis", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "manager_svennis_in", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "manager_svennis_in_the", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "svennis_in", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "svennis_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "svennis_in_the_croud", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "in_the", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "in_the_croud", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "the_croud", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "see_england_manager_sven", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "england_manager_sven", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "england_manager_sven_göran", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "manager_sven", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "manager_sven_göran", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "sven_göran", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "göran_eriksson", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "eriksson_in", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "eriksson_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
assertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
assertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
assertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);

assertNull(ts.next(reusableToken));
assertFalse(ts.incrementToken());

}
@ -445,40 +452,46 @@ public class TestShingleMatrixFilter extends TestCase {

// assert-methods start here

private Token assertNext(TokenStream ts, final Token reusableToken, String text) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
return nextToken;
private void assertNext(TokenStream ts, String text) throws IOException {
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);

assertTrue(ts.incrementToken());
assertEquals(text, termAtt.term());
}

private Token assertNext(TokenStream ts, final Token reusableToken, String text, int positionIncrement, float boost) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(positionIncrement, nextToken.getPositionIncrement());
assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()), 0);
return nextToken;
private void assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException {
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);

assertTrue(ts.incrementToken());
assertEquals(text, termAtt.term());
assertEquals(positionIncrement, posIncrAtt.getPositionIncrement());
assertEquals(boost, payloadAtt.getPayload() == null ? 1f : PayloadHelper.decodeFloat(payloadAtt.getPayload().getData()), 0);
}

private Token assertNext(TokenStream ts, final Token reusableToken, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(positionIncrement, nextToken.getPositionIncrement());
assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()), 0);
assertEquals(startOffset, nextToken.startOffset());
assertEquals(endOffset, nextToken.endOffset());
return nextToken;
private void assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);

assertTrue(ts.incrementToken());
assertEquals(text, termAtt.term());
assertEquals(positionIncrement, posIncrAtt.getPositionIncrement());
assertEquals(boost, payloadAtt.getPayload() == null ? 1f : PayloadHelper.decodeFloat(payloadAtt.getPayload().getData()), 0);
assertEquals(startOffset, offsetAtt.startOffset());
assertEquals(endOffset, offsetAtt.endOffset());
}

private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);

private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(startOffset, nextToken.startOffset());
assertEquals(endOffset, nextToken.endOffset());
return nextToken;
assertTrue(ts.incrementToken());
assertEquals(text, termAtt.term());
assertEquals(startOffset, offsetAtt.startOffset());
assertEquals(endOffset, offsetAtt.endOffset());
}

private static Token createToken(String term, int start, int offset)
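The rewritten assert helpers no longer thread a reusable Token through every call; instead each helper asks the stream for exactly the attributes it wants to check. This works because addAttribute() is idempotent on an AttributeSource: asking again for a class that is already registered returns the existing instance rather than a fresh one, so a late lookup inside a helper still aliases the state that incrementToken() updates. A small sketch of that property (illustrative only, not part of the patch):

  TermAttribute first = (TermAttribute) ts.addAttribute(TermAttribute.class);
  TermAttribute second = (TermAttribute) ts.addAttribute(TermAttribute.class);
  // Both variables point to the one TermAttribute the stream owns.
  assertSame(first, second);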
@ -492,31 +505,41 @@ public class TestShingleMatrixFilter extends TestCase {
public static class TokenListStream extends TokenStream {

private Collection tokens;

public TokenListStream(TokenStream ts) throws IOException {
tokens = new ArrayList();
final Token reusableToken = new Token();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
tokens.add((Token) nextToken.clone());
}
}

TermAttribute termAtt;
PositionIncrementAttribute posIncrAtt;
PayloadAttribute payloadAtt;
OffsetAttribute offsetAtt;
TypeAttribute typeAtt;
FlagsAttribute flagsAtt;

public TokenListStream(Collection tokens) {
this.tokens = tokens;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
}

private Iterator iterator;

public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
public boolean incrementToken() throws IOException {
if (iterator == null) {
iterator = tokens.iterator();
}
if (!iterator.hasNext()) {
return null;
return false;
}
Token nextToken = (Token) iterator.next();
return (Token) nextToken.clone();
Token prototype = (Token) iterator.next();
termAtt.setTermBuffer(prototype.termBuffer(), 0, prototype.termLength());
posIncrAtt.setPositionIncrement(prototype.getPositionIncrement());
flagsAtt.setFlags(prototype.getFlags());
offsetAtt.setOffset(prototype.startOffset(), prototype.endOffset());
typeAtt.setType(prototype.type());
payloadAtt.setPayload(prototype.getPayload());

return true;
}
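The reworked TokenListStream is a compact example of implementing incrementToken() for a stream that replays precomputed tokens: register one attribute per token field up front, then copy the next prototype Token's state into those attributes on each call. A hedged usage sketch inside this test class, reusing the createToken helper shown above (its body is outside this hunk; the sketch assumes it fills in the term text and offsets, as its signature suggests, and the token values are made up for illustration):

  // Build a fixed token list and replay it through the new API.
  Collection tokens = new ArrayList();
  tokens.add(createToken("hello", 0, 4));
  tokens.add(createToken("world", 5, 10));

  TokenListStream ts = new TokenListStream(tokens);
  // addAttribute returns the instances the constructor already registered.
  TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
  OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
  while (ts.incrementToken()) {
    // Prints "hello [0,4]" then "world [5,10]".
    System.out.println(termAtt.term() + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
  }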