LUCENE-1775: Change remaining contrib TokenFilters (shingle, prefix-suffix) to use the new TokenStream API.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@800195 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Busch 2009-08-03 04:33:10 +00:00
parent 5aaf5b0167
commit 457c29d31e
10 changed files with 655 additions and 366 deletions

View File

@ -18,6 +18,12 @@ API Changes
you are interested in locally and access them on each call to the method that used to pass a new
Token. Look at the included updated impls for examples. (Mark Miller)
2. LUCENE-1460: Change contrib TokenStreams/Filters to use the new
TokenStream API. (Robert Muir, Michael Busch)
3. LUCENE-1775: Change remaining TokenFilters (shingle, prefix-suffix) to
use the new TokenStream API. (Robert Muir, Michael Busch)
Bug fixes
1. LUCENE-1423: InstantiatedTermEnum#skipTo(Term) throws ArrayIndexOutOfBounds on empty index.

View File

@ -24,13 +24,16 @@ import java.io.IOException;
/**
* Links two PrefixAwareTokenFilter
* @deprecated
* <p/>
* <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
* the ones located in org.apache.lucene.analysis.tokenattributes.
*/
public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
private PrefixAwareTokenFilter suffix;
public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) {
super(suffix);
prefix = new PrefixAwareTokenFilter(prefix, input) {
public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
return PrefixAndSuffixAwareTokenFilter.this.updateInputToken(suffixToken, lastInputToken);
@ -56,11 +59,21 @@ public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
}
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
return suffix.next(reusableToken);
public final boolean incrementToken() throws IOException {
return suffix.incrementToken();
}
/**
 * Old-style variant that returns a {@link Token}; the call is forwarded
 * unchanged to <code>super.next(reusableToken)</code> and its result returned.
 *
 * @deprecated Will be removed in Lucene 3.0. This method is final, as it should
 * not be overridden. Delegates to the backwards compatibility layer.
 */
public final Token next(final Token reusableToken) throws java.io.IOException {
  final Token result = super.next(reusableToken);
  return result;
}
/**
 * Old-style no-argument variant that returns a {@link Token}; the call is
 * forwarded unchanged to <code>super.next()</code> and its result returned.
 *
 * @deprecated Will be removed in Lucene 3.0. This method is final, as it should
 * not be overridden. Delegates to the backwards compatibility layer.
 */
public final Token next() throws java.io.IOException {
  final Token result = super.next();
  return result;
}
public void reset() throws IOException {
suffix.reset();

View File

@ -19,6 +19,12 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import java.io.IOException;
@ -29,28 +35,58 @@ import java.io.IOException;
* to be used when updating the token values in the second stream based on that token.
*
* The default implementation adds last prefix token end offset to the suffix token start and end offsets.
* @deprecated
* <p/>
* <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
* the ones located in org.apache.lucene.analysis.tokenattributes.
*/
public class PrefixAwareTokenFilter extends TokenStream {
private TokenStream prefix;
private TokenStream suffix;
private TermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
private PayloadAttribute payloadAtt;
private OffsetAttribute offsetAtt;
private TypeAttribute typeAtt;
private FlagsAttribute flagsAtt;
private TermAttribute p_termAtt;
private PositionIncrementAttribute p_posIncrAtt;
private PayloadAttribute p_payloadAtt;
private OffsetAttribute p_offsetAtt;
private TypeAttribute p_typeAtt;
private FlagsAttribute p_flagsAtt;
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
super(suffix);
this.suffix = suffix;
this.prefix = prefix;
prefixExhausted = false;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
p_termAtt = (TermAttribute) prefix.addAttribute(TermAttribute.class);
p_posIncrAtt = (PositionIncrementAttribute) prefix.addAttribute(PositionIncrementAttribute.class);
p_payloadAtt = (PayloadAttribute) prefix.addAttribute(PayloadAttribute.class);
p_offsetAtt = (OffsetAttribute) prefix.addAttribute(OffsetAttribute.class);
p_typeAtt = (TypeAttribute) prefix.addAttribute(TypeAttribute.class);
p_flagsAtt = (FlagsAttribute) prefix.addAttribute(FlagsAttribute.class);
}
private Token previousPrefixToken = new Token();
private Token reusableToken = new Token();
private boolean prefixExhausted;
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
public final boolean incrementToken() throws IOException {
if (!prefixExhausted) {
Token nextToken = prefix.next(reusableToken);
Token nextToken = getNextPrefixInputToken(reusableToken);
if (nextToken == null) {
prefixExhausted = true;
} else {
@ -60,16 +96,63 @@ public class PrefixAwareTokenFilter extends TokenStream {
if (p != null) {
previousPrefixToken.setPayload((Payload) p.clone());
}
return nextToken;
setCurrentToken(nextToken);
return true;
}
}
Token nextToken = suffix.next(reusableToken);
Token nextToken = getNextSuffixInputToken(reusableToken);
if (nextToken == null) {
return null;
return false;
}
return updateSuffixToken(nextToken, previousPrefixToken);
nextToken = updateSuffixToken(nextToken, previousPrefixToken);
setCurrentToken(nextToken);
return true;
}
/**
 * Token-returning variant kept for the old API; hands the reusable token
 * straight to <code>super.next(reusableToken)</code> and returns what it yields.
 *
 * @deprecated Will be removed in Lucene 3.0. This method is final, as it should
 * not be overridden. Delegates to the backwards compatibility layer.
 */
public final Token next(final Token reusableToken) throws java.io.IOException {
  final Token fromSuper = super.next(reusableToken);
  return fromSuper;
}
/**
 * No-argument token-returning variant kept for the old API; returns whatever
 * <code>super.next()</code> yields.
 *
 * @deprecated Will be removed in Lucene 3.0. This method is final, as it should
 * not be overridden. Delegates to the backwards compatibility layer.
 */
public final Token next() throws java.io.IOException {
  final Token fromSuper = super.next();
  return fromSuper;
}
/**
 * Copies the values of the given token into this stream's attributes:
 * term text, position increment, flags, offsets, type and payload.
 * A <code>null</code> token leaves every attribute untouched.
 *
 * @param token source of the values to publish; may be <code>null</code>
 */
private void setCurrentToken(Token token) {
  if (token != null) {
    termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
    posIncrAtt.setPositionIncrement(token.getPositionIncrement());
    flagsAtt.setFlags(token.getFlags());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    typeAtt.setType(token.type());
    // The payload object is handed over as-is; it is not cloned here.
    payloadAtt.setPayload(token.getPayload());
  }
}
/**
 * Advances the prefix stream by one token and copies the prefix stream's
 * attribute values (the <code>p_*</code> fields) into the supplied token.
 *
 * @param token the token instance to fill
 * @return the same <code>token</code> instance once filled, or
 *         <code>null</code> when the prefix stream has no further tokens
 * @throws IOException if advancing the prefix stream fails
 */
private Token getNextPrefixInputToken(Token token) throws IOException {
  final boolean advanced = prefix.incrementToken();
  if (advanced) {
    token.setTermBuffer(p_termAtt.termBuffer(), 0, p_termAtt.termLength());
    token.setPositionIncrement(p_posIncrAtt.getPositionIncrement());
    token.setFlags(p_flagsAtt.getFlags());
    token.setOffset(p_offsetAtt.startOffset(), p_offsetAtt.endOffset());
    token.setType(p_typeAtt.type());
    token.setPayload(p_payloadAtt.getPayload());
    return token;
  }
  return null;
}
/**
 * Advances the suffix stream by one token and copies the values found in
 * this filter's own attribute fields into the supplied token.
 * NOTE(review): this reads <code>termAtt</code> etc. rather than attributes
 * fetched from <code>suffix</code>; presumably they are shared with the suffix
 * stream through the <code>super(suffix)</code> constructor call - confirm.
 *
 * @param token the token instance to fill
 * @return the same <code>token</code> instance once filled, or
 *         <code>null</code> when the suffix stream has no further tokens
 * @throws IOException if advancing the suffix stream fails
 */
private Token getNextSuffixInputToken(Token token) throws IOException {
  final boolean advanced = suffix.incrementToken();
  if (advanced) {
    token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
    token.setPositionIncrement(posIncrAtt.getPositionIncrement());
    token.setFlags(flagsAtt.getFlags());
    token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    return token;
  }
  return null;
}
/**

View File

@ -18,12 +18,17 @@ package org.apache.lucene.analysis.shingle;
*/
import java.io.IOException;
import java.util.LinkedList;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
@ -39,8 +44,6 @@ import org.apache.lucene.analysis.Token;
public class ShingleFilter extends TokenFilter {
private LinkedList shingleBuf = new LinkedList();
private LinkedList outputBuf = new LinkedList();
private LinkedList tokenBuf = new LinkedList();
private StringBuffer[] shingles;
private String tokenType = "shingle";
@ -81,6 +84,11 @@ public class ShingleFilter extends TokenFilter {
public ShingleFilter(TokenStream input, int maxShingleSize) {
  super(input);
  setMaxShingleSize(maxShingleSize);

  // Register the attributes this filter reads and rewrites for every token.
  termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
  typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
/**
@ -148,23 +156,69 @@ public class ShingleFilter extends TokenFilter {
shingles[i].setLength(0);
}
}
private AttributeSource.State nextToken;
private int shingleBufferPosition;
private int[] endOffsets;
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (outputBuf.isEmpty()) {
fillOutputBuf(reusableToken);
public final boolean incrementToken() throws IOException {
while (true) {
if (nextToken == null) {
if (!fillShingleBuffer()) {
return false;
}
}
nextToken = (AttributeSource.State) shingleBuf.getFirst();
if (shingleBufferPosition == 0 && (! shingleBuf.isEmpty()) && outputUnigrams) {
restoreState(nextToken);
posIncrAtt.setPositionIncrement(1);
shingleBufferPosition++;
return true;
}
if (shingleBufferPosition < shingleBuf.size()) {
restoreState(nextToken);
typeAtt.setType(tokenType);
offsetAtt.setOffset(offsetAtt.startOffset(), endOffsets[shingleBufferPosition]);
StringBuffer buf = shingles[shingleBufferPosition];
int termLength = buf.length();
char[] termBuffer = termAtt.termBuffer();
if (termBuffer.length < termLength)
termBuffer = termAtt.resizeTermBuffer(termLength);
buf.getChars(0, termLength, termBuffer, 0);
termAtt.setTermLength(termLength);
if ((! outputUnigrams) && shingleBufferPosition == 1) {
posIncrAtt.setPositionIncrement(1);
} else {
posIncrAtt.setPositionIncrement(0);
}
shingleBufferPosition++;
if (shingleBufferPosition == shingleBuf.size()) {
nextToken = null;
shingleBufferPosition = 0;
}
return true;
} else {
nextToken = null;
shingleBufferPosition = 0;
}
}
Token nextToken = null;
if ( ! outputBuf.isEmpty())
{
nextToken = (Token)outputBuf.remove(0);
}
return nextToken;
}
private int numFillerTokensToInsert;
private AttributeSource.State currentToken;
private boolean hasCurrentToken;
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
private PositionIncrementAttribute posIncrAtt;
private TypeAttribute typeAtt;
/**
* Get the next token from the input stream and push it on the token buffer.
* If we encounter a token with position increment > 1, we put filler tokens
@ -174,41 +228,53 @@ public class ShingleFilter extends TokenFilter {
* @return true if a token (a real input token or a filler token) was made current, false if at end of input stream
* @throws IOException if the input stream has a problem
*/
private Token getNextToken(final Token reusableToken) throws IOException {
if (tokenBuf.isEmpty()) {
Token nextToken = input.next(reusableToken);
if (nextToken != null) {
for (int i = 1; i < nextToken.getPositionIncrement(); i++) {
Token fillerToken = (Token) nextToken.clone();
// A filler token occupies no space
fillerToken.setEndOffset(fillerToken.startOffset());
fillerToken.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
tokenBuf.add(fillerToken);
}
tokenBuf.add(nextToken.clone());
return getNextToken(nextToken);
} else {
return null;
}
} else {
return (Token)tokenBuf.remove(0);
private boolean getNextToken() throws IOException {
  for (;;) {
    // 1) Filler tokens still owed for a position gap are emitted first.
    if (numFillerTokensToInsert > 0) {
      if (currentToken != null) {
        restoreState(currentToken);
      } else {
        // First filler for this gap: remember the real token's state so it
        // can be restored for later fillers and for the real token itself.
        currentToken = captureState();
      }
      numFillerTokensToInsert--;
      // A filler token occupies no space
      final int fillerStart = offsetAtt.startOffset();
      offsetAtt.setOffset(fillerStart, offsetAtt.startOffset());
      termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
      return true;
    }

    // 2) A real token already pulled from the input is handed out next.
    if (hasCurrentToken) {
      if (currentToken != null) {
        // Fillers overwrote the attributes; bring the real token back.
        restoreState(currentToken);
        currentToken = null;
      }
      hasCurrentToken = false;
      return true;
    }

    // 3) Nothing pending: pull the next token from the input.
    if (!input.incrementToken()) {
      return false;
    }
    hasCurrentToken = true;

    final int positionGap = posIncrAtt.getPositionIncrement();
    if (positionGap > 1) {
      // One filler per skipped position.
      numFillerTokensToInsert = positionGap - 1;
    }
  }
}
}
/**
* Fill the output buffer with new shingles.
*
* @throws IOException if there's a problem getting the next token
*/
private void fillOutputBuf(Token token) throws IOException {
private boolean fillShingleBuffer() throws IOException {
boolean addedToken = false;
/*
* Try to fill the shingle buffer.
*/
do {
token = getNextToken(token);
if (token != null) {
shingleBuf.add(token.clone());
if (getNextToken()) {
shingleBuf.add(captureState());
if (shingleBuf.size() > maxShingleSize)
{
shingleBuf.remove(0);
@ -219,69 +285,55 @@ public class ShingleFilter extends TokenFilter {
}
} while (shingleBuf.size() < maxShingleSize);
if (shingleBuf.isEmpty()) {
return false;
}
/*
* If no new token could be added to the shingle buffer, we have reached
* the end of the input stream and have to discard the least recent token.
*/
if (! addedToken) {
if (shingleBuf.isEmpty()) {
return;
} else {
shingleBuf.remove(0);
}
shingleBuf.remove(0);
}
if (shingleBuf.isEmpty()) {
return false;
}
clearShingles();
int[] endOffsets = new int[shingleBuf.size()];
endOffsets = new int[shingleBuf.size()];
for (int i = 0; i < endOffsets.length; i++) {
endOffsets[i] = 0;
}
int i = 0;
Token shingle = null;
for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
shingle = (Token) it.next();
restoreState((AttributeSource.State) it.next());
for (int j = i; j < shingles.length; j++) {
if (shingles[j].length() != 0) {
shingles[j].append(TOKEN_SEPARATOR);
}
shingles[j].append(shingle.termBuffer(), 0, shingle.termLength());
shingles[j].append(termAtt.termBuffer(), 0, termAtt.termLength());
}
endOffsets[i] = shingle.endOffset();
endOffsets[i] = offsetAtt.endOffset();
i++;
}
return true;
}
/**
 * Old-API method returning a {@link Token}; passes the reusable token on to
 * <code>super.next(reusableToken)</code> and returns that call's result.
 *
 * @deprecated Will be removed in Lucene 3.0. This method is final, as it should
 * not be overridden. Delegates to the backwards compatibility layer.
 */
public final Token next(final Token reusableToken) throws java.io.IOException {
  final Token delegated = super.next(reusableToken);
  return delegated;
}
if ((! shingleBuf.isEmpty()) && outputUnigrams) {
Token unigram = (Token) shingleBuf.getFirst();
unigram.setPositionIncrement(1);
outputBuf.add(unigram);
}
/*
* Push new tokens to the output buffer.
*/
if (!shingleBuf.isEmpty()) {
Token firstShingle = (Token) shingleBuf.get(0);
shingle = (Token) firstShingle.clone();
shingle.setType(tokenType);
}
for (int j = 1; j < shingleBuf.size(); j++) {
shingle.setEndOffset(endOffsets[j]);
StringBuffer buf = shingles[j];
int termLength = buf.length();
char[] termBuffer = shingle.termBuffer();
if (termBuffer.length < termLength)
termBuffer = shingle.resizeTermBuffer(termLength);
buf.getChars(0, termLength, termBuffer, 0);
shingle.setTermLength(termLength);
if ((! outputUnigrams) && j == 1) {
shingle.setPositionIncrement(1);
} else {
shingle.setPositionIncrement(0);
}
outputBuf.add(shingle.clone());
}
/**
 * Old-API no-argument method returning a {@link Token}; returns the result
 * of <code>super.next()</code>.
 *
 * @deprecated Will be removed in Lucene 3.0. This method is final, as it should
 * not be overridden. Delegates to the backwards compatibility layer.
 */
public final Token next() throws java.io.IOException {
  final Token delegated = super.next();
  return delegated;
}
}

View File

@ -30,6 +30,12 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
@ -104,6 +110,9 @@ import org.apache.lucene.index.Payload;
* <p>The filter also has basic support for calculating weights for the shingles
* based on the weights of the tokens from the input stream, output shingle size, etc.
* See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}.
* <p/>
* <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
* the ones located in org.apache.lucene.analysis.tokenattributes.
*/
public class ShingleMatrixFilter extends TokenStream {
@ -183,7 +192,21 @@ public class ShingleMatrixFilter extends TokenStream {
private TokenStream input;
private TermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
private PayloadAttribute payloadAtt;
private OffsetAttribute offsetAtt;
private TypeAttribute typeAtt;
private FlagsAttribute flagsAtt;
private TermAttribute in_termAtt;
private PositionIncrementAttribute in_posIncrAtt;
private PayloadAttribute in_payloadAtt;
private OffsetAttribute in_offsetAtt;
private TypeAttribute in_typeAtt;
private FlagsAttribute in_flagsAtt;
/**
* Creates a shingle filter based on a user defined matrix.
*
@ -205,8 +228,22 @@ public class ShingleMatrixFilter extends TokenStream {
this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
this.settingsCodec = settingsCodec;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
// set the input to be an empty token stream, we already have the data.
this.input = new EmptyTokenStream();
in_termAtt = (TermAttribute) input.addAttribute(TermAttribute.class);
in_posIncrAtt = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
in_payloadAtt = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
in_offsetAtt = (OffsetAttribute) input.addAttribute(OffsetAttribute.class);
in_typeAtt = (TypeAttribute) input.addAttribute(TypeAttribute.class);
in_flagsAtt = (FlagsAttribute) input.addAttribute(FlagsAttribute.class);
}
/**
@ -273,6 +310,19 @@ public class ShingleMatrixFilter extends TokenStream {
this.spacerCharacter = spacerCharacter;
this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
this.settingsCodec = settingsCodec;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
in_termAtt = (TermAttribute) input.addAttribute(TermAttribute.class);
in_posIncrAtt = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
in_payloadAtt = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
in_offsetAtt = (OffsetAttribute) input.addAttribute(OffsetAttribute.class);
in_typeAtt = (TypeAttribute) input.addAttribute(TypeAttribute.class);
in_flagsAtt = (FlagsAttribute) input.addAttribute(FlagsAttribute.class);
}
// internal filter instance variables
@ -302,10 +352,10 @@ public class ShingleMatrixFilter extends TokenStream {
}
private Matrix matrix;
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
private Token reusableToken = new Token();
public final boolean incrementToken() throws IOException {
if (matrix == null) {
matrix = new Matrix();
// fill matrix with maximumShingleSize columns
@ -321,9 +371,39 @@ public class ShingleMatrixFilter extends TokenStream {
do {
token = produceNextToken(reusableToken);
} while (token == request_next_token);
if (token == null) return false;
termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
flagsAtt.setFlags(token.getFlags());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
typeAtt.setType(token.type());
payloadAtt.setPayload(token.getPayload());
return true;
}
/**
 * Advances the wrapped input stream by one token and copies the input's
 * attribute values (the <code>in_*</code> fields) into the supplied token.
 *
 * @param token the token instance to fill
 * @return the same <code>token</code> instance once filled, or
 *         <code>null</code> when the input stream has no further tokens
 * @throws IOException if advancing the input stream fails
 */
private Token getNextInputToken(Token token) throws IOException {
  final boolean advanced = input.incrementToken();
  if (advanced) {
    token.setTermBuffer(in_termAtt.termBuffer(), 0, in_termAtt.termLength());
    token.setPositionIncrement(in_posIncrAtt.getPositionIncrement());
    token.setFlags(in_flagsAtt.getFlags());
    token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset());
    token.setType(in_typeAtt.type());
    token.setPayload(in_payloadAtt.getPayload());
    return token;
  }
  return null;
}
/**
 * Token-returning method of the old API; forwards to
 * <code>super.next(reusableToken)</code> and returns its result.
 *
 * @deprecated Will be removed in Lucene 3.0. This method is final, as it should
 * not be overridden. Delegates to the backwards compatibility layer.
 */
public final Token next(final Token reusableToken) throws java.io.IOException {
  final Token viaCompatLayer = super.next(reusableToken);
  return viaCompatLayer;
}
/**
 * No-argument token-returning method of the old API; forwards to
 * <code>super.next()</code> and returns its result.
 *
 * @deprecated Will be removed in Lucene 3.0. This method is final, as it should
 * not be overridden. Delegates to the backwards compatibility layer.
 */
public final Token next() throws java.io.IOException {
  final Token viaCompatLayer = super.next();
  return viaCompatLayer;
}
private static final Token request_next_token = new Token();
@ -573,7 +653,7 @@ public class ShingleMatrixFilter extends TokenStream {
token = readColumnBuf;
readColumnBuf = null;
} else {
token = input.next(new Token());
token = getNextInputToken(new Token());
}
if (token == null) {
@ -585,7 +665,7 @@ public class ShingleMatrixFilter extends TokenStream {
currentReaderRow.getTokens().add(token);
TokenPositioner tokenPositioner;
while ((readColumnBuf = input.next(new Token())) != null
while ((readColumnBuf = getNextInputToken(new Token())) != null
&& (tokenPositioner = settingsCodec.getTokenPositioner(readColumnBuf)) != TokenPositioner.newColumn) {
if (tokenPositioner == TokenPositioner.sameRow) {
@ -599,7 +679,7 @@ public class ShingleMatrixFilter extends TokenStream {
}
if (readColumnBuf == null) {
readColumnBuf = input.next(new Token());
readColumnBuf = getNextInputToken(new Token());
if (readColumnBuf == null) {
currentReaderColumn.setLast(true);
}

View File

@ -21,6 +21,8 @@ import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.io.StringReader;
@ -34,22 +36,22 @@ public class TestPrefixAndSuffixAwareTokenFilter extends TestCase {
new WhitespaceTokenizer(new StringReader("hello world")),
new SingleTokenTokenStream(createToken("$", 0, 0)));
Token token = new Token();
assertNext(ts, token, "^", 0, 0);
assertNext(ts, token, "hello", 0, 5);
assertNext(ts, token, "world", 6, 11);
assertNext(ts, token, "$", 11, 11);
assertNull(ts.next(token));
assertNext(ts, "^", 0, 0);
assertNext(ts, "hello", 0, 5);
assertNext(ts, "world", 6, 11);
assertNext(ts, "$", 11, 11);
assertFalse(ts.incrementToken());
}
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(startOffset, nextToken.startOffset());
assertEquals(endOffset, nextToken.endOffset());
return nextToken;
/**
 * Advances the stream by one token and asserts that token's term text and
 * start/end offsets.
 *
 * @param ts the stream under test
 * @param text expected term text of the next token
 * @param startOffset expected start offset of the next token
 * @param endOffset expected end offset of the next token
 * @throws IOException if advancing the stream fails
 */
private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
  // Attribute handles are obtained before advancing, as in every call.
  final TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
  final OffsetAttribute offsets = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);

  final boolean advanced = ts.incrementToken();
  assertTrue(advanced);

  assertEquals(text, term.term());
  assertEquals(startOffset, offsets.startOffset());
  assertEquals(endOffset, offsets.endOffset());
}
private static Token createToken(String term, int start, int offset)

View File

@ -21,6 +21,8 @@ import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.io.StringReader;
@ -34,32 +36,31 @@ public class TestPrefixAwareTokenFilter extends TestCase {
ts = new PrefixAwareTokenFilter(
new SingleTokenTokenStream(createToken("a", 0, 1)),
new SingleTokenTokenStream(createToken("b", 0, 1)));
final Token reusableToken = new Token();
assertNext(ts, reusableToken, "a", 0, 1);
assertNext(ts, reusableToken, "b", 1, 2);
assertNull(ts.next(reusableToken));
assertNext(ts, "a", 0, 1);
assertNext(ts, "b", 1, 2);
assertFalse(ts.incrementToken());
// prefix and suffix using 2x prefix
ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)), new WhitespaceTokenizer(new StringReader("hello world")));
ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(createToken("$", 0, 0)));
assertNext(ts, reusableToken, "^", 0, 0);
assertNext(ts, reusableToken, "hello", 0, 5);
assertNext(ts, reusableToken, "world", 6, 11);
assertNext(ts, reusableToken, "$", 11, 11);
assertNull(ts.next(reusableToken));
assertNext(ts, "^", 0, 0);
assertNext(ts, "hello", 0, 5);
assertNext(ts, "world", 6, 11);
assertNext(ts, "$", 11, 11);
assertFalse(ts.incrementToken());
}
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(startOffset, nextToken.startOffset());
assertEquals(endOffset, nextToken.endOffset());
return nextToken;
/**
 * Pulls the next token from the stream and checks its term text and offsets
 * against the expected values.
 *
 * @param ts the stream under test
 * @param text expected term text of the next token
 * @param startOffset expected start offset of the next token
 * @param endOffset expected end offset of the next token
 * @throws IOException if advancing the stream fails
 */
private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
  // Attribute handles are obtained before advancing, as in every call.
  final TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
  final OffsetAttribute offsets = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);

  final boolean advanced = ts.incrementToken();
  assertTrue(advanced);

  assertEquals(text, term.term());
  assertEquals(startOffset, offsets.startOffset());
  assertEquals(endOffset, offsets.endOffset());
}
private static Token createToken(String term, int start, int offset)

View File

@ -23,6 +23,8 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
@ -157,10 +159,13 @@ public class ShingleAnalyzerWrapperTest extends TestCase {
TokenStream ts = analyzer.tokenStream("content",
new StringReader("this sentence"));
int j = -1;
final Token reusableToken = new Token();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
j += nextToken.getPositionIncrement();
String termText = nextToken.term();
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
while (ts.incrementToken()) {
j += posIncrAtt.getPositionIncrement();
String termText = termAtt.term();
q.add(new Term("content", termText), j);
}
@ -182,9 +187,11 @@ public class ShingleAnalyzerWrapperTest extends TestCase {
TokenStream ts = analyzer.tokenStream("content",
new StringReader("test sentence"));
final Token reusableToken = new Token();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
String termText = nextToken.term();
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
while (ts.incrementToken()) {
String termText = termAtt.term();
q.add(new TermQuery(new Term("content", termText)),
BooleanClause.Occur.SHOULD);
}

View File

@ -22,6 +22,11 @@ import java.io.IOException;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttributeImpl;
public class ShingleFilterTest extends TestCase {
@ -29,18 +34,31 @@ public class ShingleFilterTest extends TestCase {
protected int index = 0;
protected Token[] testToken;
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
private PositionIncrementAttribute posIncrAtt;
private TypeAttribute typeAtt;
/**
 * Creates a stream over the given tokens and registers the term, offset,
 * position-increment and type attributes on itself.
 *
 * @param testToken the tokens this stream keeps a reference to
 */
public TestTokenStream(Token[] testToken) {
  super();
  this.testToken = testToken;

  termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
  typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
public final boolean incrementToken() throws IOException {
if (index < testToken.length) {
return testToken[index++];
Token t = testToken[index++];
termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
offsetAtt.setOffset(t.startOffset(), t.endOffset());
posIncrAtt.setPositionIncrement(t.getPositionIncrement());
typeAtt.setType(TypeAttributeImpl.DEFAULT_TYPE);
return true;
} else {
return null;
return false;
}
}
}
@ -163,25 +181,29 @@ public class ShingleFilterTest extends TestCase {
this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
}
// Runs a ShingleFilter of the given maximum shingle size over tokensToShingle and compares
// each produced token, by index, against tokensToCompare (term text, start/end offset),
// positionIncrements and types.
// NOTE(review): this span interleaves two versions of the method body, apparently the
// removed and added sides of a diff hunk rendered without +/- markers. Lines reading from
// "nextToken" belong to the next(Token) version; lines reading from the *Att variables belong
// to the incrementToken() version.
// NOTE(review): nothing after the loop checks that i reached tokensToCompare.length, so a
// filter that emits fewer tokens than expected would not fail here.
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
int[] positionIncrements, String[] types)
throws IOException {
TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
// Attribute views on the filter, used to read each token's state after incrementToken().
TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) filter.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) filter.addAttribute(PositionIncrementAttribute.class);
TypeAttribute typeAtt = (TypeAttribute) filter.addAttribute(TypeAttribute.class);
// i indexes the expected-value arrays in lock step with the produced tokens.
int i = 0;
// next(Token) version of the loop header and term lookup.
final Token reusableToken = new Token();
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
String termText = nextToken.term();
// incrementToken() version of the loop header and term lookup.
while (filter.incrementToken()) {
String termText = termAtt.term();
String goldText = tokensToCompare[i].term();
assertEquals("Wrong termText", goldText, termText);
assertEquals("Wrong startOffset for token \"" + termText + "\"",
tokensToCompare[i].startOffset(), nextToken.startOffset());
tokensToCompare[i].startOffset(), offsetAtt.startOffset());
assertEquals("Wrong endOffset for token \"" + termText + "\"",
tokensToCompare[i].endOffset(), nextToken.endOffset());
tokensToCompare[i].endOffset(), offsetAtt.endOffset());
assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
positionIncrements[i], nextToken.getPositionIncrement());
assertEquals("Wrong type for token \"" + termText + "\"", types[i], nextToken.type());
positionIncrements[i], posIncrAtt.getPositionIncrement());
assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
i++;
}
}

View File

@ -17,22 +17,29 @@ package org.apache.lucene.analysis.shingle;
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import junit.framework.TestCase;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
public class TestShingleMatrixFilter extends TestCase {
@ -43,7 +50,7 @@ public class TestShingleMatrixFilter extends TestCase {
TokenStream ts;
ts = new ShingleMatrixFilter(new EmptyTokenStream(), 1, 2, new Character(' '), false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
assertNull(ts.next(new Token()));
assertFalse(ts.incrementToken());
TokenListStream tls;
LinkedList tokens;
@ -66,20 +73,20 @@ public class TestShingleMatrixFilter extends TestCase {
Token reusableToken = new Token();
assertNext(ts, reusableToken, "please", 0, 6);
assertNext(ts, reusableToken, "please divide", 0, 13);
assertNext(ts, reusableToken, "divide", 7, 13);
assertNext(ts, reusableToken, "divide this", 7, 18);
assertNext(ts, reusableToken, "this", 14, 18);
assertNext(ts, reusableToken, "this sentence", 14, 27);
assertNext(ts, reusableToken, "sentence", 19, 27);
assertNext(ts, reusableToken, "sentence into", 19, 32);
assertNext(ts, reusableToken, "into", 28, 32);
assertNext(ts, reusableToken, "into shingles", 28, 39);
assertNext(ts, reusableToken, "shingles", 33, 39);
assertNext(ts, "please", 0, 6);
assertNext(ts, "please divide", 0, 13);
assertNext(ts, "divide", 7, 13);
assertNext(ts, "divide this", 7, 18);
assertNext(ts, "this", 14, 18);
assertNext(ts, "this sentence", 14, 27);
assertNext(ts, "sentence", 19, 27);
assertNext(ts, "sentence into", 19, 32);
assertNext(ts, "into", 28, 32);
assertNext(ts, "into shingles", 28, 39);
assertNext(ts, "shingles", 33, 39);
assertNull(ts.next(reusableToken));
assertFalse(ts.incrementToken());
}
@ -92,7 +99,7 @@ public class TestShingleMatrixFilter extends TestCase {
ShingleMatrixFilter.defaultSettingsCodec = null;//new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
TokenStream ts;
TokenListStream tls;
TokenStream tls;
LinkedList tokens;
// test a plain old token stream with synonyms translated to rows.
@ -111,25 +118,25 @@ public class TestShingleMatrixFilter extends TestCase {
ts = new ShingleMatrixFilter(tls, 2, 2, new Character('_'), false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
final Token reusableToken = new Token();
assertNext(ts, reusableToken, "hello_world");
assertNext(ts, reusableToken, "greetings_world");
assertNext(ts, reusableToken, "hello_earth");
assertNext(ts, reusableToken, "greetings_earth");
assertNext(ts, reusableToken, "hello_tellus");
assertNext(ts, reusableToken, "greetings_tellus");
assertNull(ts.next(reusableToken));
assertNext(ts, "hello_world");
assertNext(ts, "greetings_world");
assertNext(ts, "hello_earth");
assertNext(ts, "greetings_earth");
assertNext(ts, "hello_tellus");
assertNext(ts, "greetings_tellus");
assertFalse(ts.incrementToken());
// bi-grams with no spacer character, start offset, end offset
tls.reset();
ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
assertNext(ts, reusableToken, "helloworld", 0, 10);
assertNext(ts, reusableToken, "greetingsworld", 0, 10);
assertNext(ts, reusableToken, "helloearth", 0, 10);
assertNext(ts, reusableToken, "greetingsearth", 0, 10);
assertNext(ts, reusableToken, "hellotellus", 0, 10);
assertNext(ts, reusableToken, "greetingstellus", 0, 10);
assertNull(ts.next(reusableToken));
assertNext(ts, "helloworld", 0, 10);
assertNext(ts, "greetingsworld", 0, 10);
assertNext(ts, "helloearth", 0, 10);
assertNext(ts, "greetingsearth", 0, 10);
assertNext(ts, "hellotellus", 0, 10);
assertNext(ts, "greetingstellus", 0, 10);
assertFalse(ts.incrementToken());
// add ^_prefix_and_suffix_$
@ -148,7 +155,7 @@ public class TestShingleMatrixFilter extends TestCase {
tls = new TokenListStream(tokens);
ts = new PrefixAndSuffixAwareTokenFilter(new SingleTokenTokenStream(tokenFactory("^", 1, 100f, 0, 0)), tls, new SingleTokenTokenStream(tokenFactory("$", 1, 50f, 0, 0)));
tls = new TokenListStream(ts);
tls = new CachingTokenFilter(ts);
// bi-grams, position increment, weight, start offset, end offset
@ -159,18 +166,18 @@ public class TestShingleMatrixFilter extends TestCase {
// token.clear();
// }
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
assertNull(ts.next(reusableToken));
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
assertFalse(ts.incrementToken());
// test unlimited size and allow single boundary token as shingle
tls.reset();
@ -182,44 +189,44 @@ public class TestShingleMatrixFilter extends TestCase {
// token.clear();
// }
assertNext(ts, reusableToken, "^", 1, 10.0f, 0, 0);
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "^_hello_world", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello", 1, 1.0f, 0, 4);
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "world", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "$", 1, 7.071068f, 10, 10);
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "^_greetings_world", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings", 1, 1.0f, 0, 4);
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "^_hello_earth", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "earth", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "^_greetings_earth", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "^_hello_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "tellus", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "^_greetings_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "^", 1, 10.0f, 0, 0);
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello", 1, 1.0f, 0, 4);
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "world", 1, 1.0f, 5, 10);
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "$", 1, 7.071068f, 10, 10);
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings", 1, 1.0f, 0, 4);
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "earth", 1, 1.0f, 5, 10);
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "tellus", 1, 1.0f, 5, 10);
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
assertNull(ts.next(reusableToken));
assertFalse(ts.incrementToken());
// test unlimited size but don't allow single boundary token as shingle
@ -230,43 +237,43 @@ public class TestShingleMatrixFilter extends TestCase {
// token.clear();
// }
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "^_hello_world", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello", 1, 1.0f, 0, 4);
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "world", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "^_greetings_world", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings", 1, 1.0f, 0, 4);
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "^_hello_earth", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "earth", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "^_greetings_earth", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "^_hello_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "tellus", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "^_greetings_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello", 1, 1.0f, 0, 4);
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "world", 1, 1.0f, 5, 10);
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings", 1, 1.0f, 0, 4);
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "earth", 1, 1.0f, 5, 10);
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "tellus", 1, 1.0f, 5, 10);
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
assertNull(ts.next(reusableToken));
assertFalse(ts.incrementToken());
System.currentTimeMillis();
@ -301,20 +308,20 @@ public class TestShingleMatrixFilter extends TestCase {
// shingle, position increment, weight, start offset, end offset
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_and", 1, 1.4142135f, 0, 4);
assertNext(ts, reusableToken, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
assertNext(ts, reusableToken, "and_salutations", 1, 1.4142135f, 0, 4);
assertNext(ts, reusableToken, "and_salutations_world", 1, 1.7320508f, 0, 10);
assertNext(ts, reusableToken, "salutations_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "and_salutations_earth", 1, 1.7320508f, 0, 10);
assertNext(ts, reusableToken, "salutations_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
assertNext(ts, reusableToken, "salutations_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
assertNull(ts.next(reusableToken));
assertFalse(ts.incrementToken());
System.currentTimeMillis();
@ -361,47 +368,47 @@ public class TestShingleMatrixFilter extends TestCase {
// }
final Token reusableToken = new Token();
assertNext(ts, reusableToken, "no_surprise", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "no_surprise_to", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "no_surprise_to_see", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "surprise_to", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "surprise_to_see", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "surprise_to_see_england", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "to_see", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "to_see_england", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "to_see_england_manager", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "see_england", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "see_england_manager", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "see_england_manager_svennis", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "england_manager", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "england_manager_svennis", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "england_manager_svennis_in", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "manager_svennis", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "manager_svennis_in", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "manager_svennis_in_the", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "svennis_in", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "svennis_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "svennis_in_the_croud", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "in_the", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "in_the_croud", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "the_croud", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "see_england_manager_sven", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "england_manager_sven", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "england_manager_sven_göran", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "manager_sven", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "manager_sven_göran", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "sven_göran", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "göran_eriksson", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "eriksson_in", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "eriksson_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
assertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
assertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
assertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
assertNull(ts.next(reusableToken));
assertFalse(ts.incrementToken());
}
@ -445,40 +452,46 @@ public class TestShingleMatrixFilter extends TestCase {
// assert-methods start here
// NOTE(review): the lines below interleave two versions of this helper, apparently the
// removed and added sides of a diff hunk rendered without +/- markers.
// Version using next(Token): pulls one token, requires it to be non-null, checks its term
// text and returns the token.
private Token assertNext(TokenStream ts, final Token reusableToken, String text) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
return nextToken;
// Version using incrementToken(): requires the stream to advance, then checks the term text
// through the stream's TermAttribute; returns nothing.
private void assertNext(TokenStream ts, String text) throws IOException {
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
assertTrue(ts.incrementToken());
assertEquals(text, termAtt.term());
}
// NOTE(review): the lines below interleave two versions of this helper, apparently the
// removed and added sides of a diff hunk rendered without +/- markers.
// Version using next(Token): checks term text, position increment and boost on the returned
// token, then returns it.
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int positionIncrement, float boost) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(positionIncrement, nextToken.getPositionIncrement());
// A token without payload is compared as boost 1f; otherwise the payload bytes are decoded as a float.
assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()), 0);
return nextToken;
// Version using incrementToken(): performs the same three checks through the stream's
// term, position-increment and payload attributes; returns nothing.
private void assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException {
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);
assertTrue(ts.incrementToken());
assertEquals(text, termAtt.term());
assertEquals(positionIncrement, posIncrAtt.getPositionIncrement());
// Missing payload is compared as boost 1f; the delta of 0 demands an exact float match.
assertEquals(boost, payloadAtt.getPayload() == null ? 1f : PayloadHelper.decodeFloat(payloadAtt.getPayload().getData()), 0);
}
// NOTE(review): the lines below interleave two versions of this helper, apparently the
// removed and added sides of a diff hunk rendered without +/- markers.
// Version using next(Token): checks term text, position increment, boost and start/end
// offsets on the returned token, then returns it.
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(positionIncrement, nextToken.getPositionIncrement());
// A token without payload is compared as boost 1f; otherwise the payload bytes are decoded as a float.
assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()), 0);
assertEquals(startOffset, nextToken.startOffset());
assertEquals(endOffset, nextToken.endOffset());
return nextToken;
// Version using incrementToken(): performs the same checks through the stream's term,
// position-increment, payload and offset attributes; returns nothing.
private void assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
assertTrue(ts.incrementToken());
assertEquals(text, termAtt.term());
assertEquals(positionIncrement, posIncrAtt.getPositionIncrement());
// Missing payload is compared as boost 1f; the delta of 0 demands an exact float match.
assertEquals(boost, payloadAtt.getPayload() == null ? 1f : PayloadHelper.decodeFloat(payloadAtt.getPayload().getData()), 0);
assertEquals(startOffset, offsetAtt.startOffset());
assertEquals(endOffset, offsetAtt.endOffset());
}
// NOTE(review): the lines below interleave two versions of this helper, apparently the
// removed and added sides of a diff hunk rendered without +/- markers. Here the
// incrementToken() header and attribute lookups come first, the whole next(Token) version
// sits in the middle, and the incrementToken() assertions follow it.
// Version using incrementToken(): checks term text and start/end offsets through the
// stream's term and offset attributes; returns nothing.
private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
// Version using next(Token): checks term text and start/end offsets on the returned token,
// then returns it.
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(startOffset, nextToken.startOffset());
assertEquals(endOffset, nextToken.endOffset());
return nextToken;
// Remaining statements of the incrementToken() version.
assertTrue(ts.incrementToken());
assertEquals(text, termAtt.term());
assertEquals(startOffset, offsetAtt.startOffset());
assertEquals(endOffset, offsetAtt.endOffset());
}
private static Token createToken(String term, int start, int offset)
@ -492,31 +505,41 @@ public class TestShingleMatrixFilter extends TestCase {
public static class TokenListStream extends TokenStream {
private Collection tokens;
/**
 * Drains the given stream through {@code next(Token)} and stores a clone of
 * every token it returns, in the order they were produced.
 *
 * @param ts the stream to read to exhaustion
 * @throws IOException if reading from {@code ts} fails
 */
public TokenListStream(TokenStream ts) throws IOException {
  tokens = new ArrayList();
  final Token scratch = new Token();
  Token current = ts.next(scratch);
  while (current != null) {
    // Clone before storing: the same scratch token is handed to every next() call.
    Token copy = (Token) current.clone();
    tokens.add(copy);
    current = ts.next(scratch);
  }
}
TermAttribute termAtt;
PositionIncrementAttribute posIncrAtt;
PayloadAttribute payloadAtt;
OffsetAttribute offsetAtt;
TypeAttribute typeAtt;
FlagsAttribute flagsAtt;
/**
 * Builds a stream over the given token collection.
 * <p>
 * The collection is stored as-is (no copy) and the term, position-increment,
 * payload, offset, type and flags attributes are registered on this stream,
 * in that order.
 *
 * @param tokens the tokens backing this stream
 */
public TokenListStream(Collection tokens) {
  this.tokens = tokens;

  // Registration order is kept: term, position increment, payload, offset, type, flags.
  this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
  this.payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
  this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
  this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
}
private Iterator iterator;
// NOTE(review): this span interleaves two versions of the same method, apparently the
// removed and added sides of a diff hunk rendered without +/- markers.
// Version using next(Token): header plus a null check on the reusable token.
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
// Version using incrementToken(): returns true while a token was produced.
public boolean incrementToken() throws IOException {
// The iterator over the stored tokens is created lazily on first use.
if (iterator == null) {
iterator = tokens.iterator();
}
if (!iterator.hasNext()) {
// Collection exhausted: null in the next(Token) version, false in the incrementToken() version.
return null;
return false;
}
// next(Token) version: returns a clone of the next stored token.
Token nextToken = (Token) iterator.next();
return (Token) nextToken.clone();
// incrementToken() version: copies the next stored token's term text, position increment,
// flags, offsets, type and payload into this stream's attributes.
Token prototype = (Token) iterator.next();
termAtt.setTermBuffer(prototype.termBuffer(), 0, prototype.termLength());
posIncrAtt.setPositionIncrement(prototype.getPositionIncrement());
flagsAtt.setFlags(prototype.getFlags());
offsetAtt.setOffset(prototype.startOffset(), prototype.endOffset());
typeAtt.setType(prototype.type());
// The payload reference from the stored token is passed through without cloning.
payloadAtt.setPayload(prototype.getPayload());
return true;
}