LUCENE-2400: ShingleFilter: don't output all-filler shingles/unigrams; also, convert from TermAttribute to CharTermAttribute

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940451 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2010-05-03 13:28:14 +00:00
parent 5abbf3429c
commit 7cd9a8da03
3 changed files with 350 additions and 113 deletions

View File

@ -145,6 +145,10 @@ New features
multiple languages. The default configuration includes special support for
Thai, Lao, Myanmar, and Khmer. (Robert Muir, Uwe Schindler)
* LUCENE-2400: ShingleFilter was changed to don't output all-filler shingles and
unigrams, and uses a more performant algorithm to build grams using a linked list
of AttributeSource.cloneAttributes() instances and the new copyTo() method.
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@ -18,14 +18,16 @@ package org.apache.lucene.analysis.shingle;
*/
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
@ -66,12 +68,12 @@ public final class ShingleFilter extends TokenFilter {
*/
public static final String TOKEN_SEPARATOR = " ";
/**
* The sequence of input stream tokens (or filler tokens, if necessary)
* that will be composed to form output shingles.
*/
private LinkedList<State> inputWindow = new LinkedList<State>();
private LinkedList<InputWindowToken> inputWindow
= new LinkedList<InputWindowToken>();
/**
* The number of input tokens in the next output token. This is the "n" in
@ -80,9 +82,9 @@ public final class ShingleFilter extends TokenFilter {
private CircularSequence gramSize;
/**
* Shingle text is composed here.
* Shingle and unigram text is composed here.
*/
private StringBuilder shingleBuilder = new StringBuilder();
private StringBuilder gramBuilder = new StringBuilder();
/**
* The token type attribute value to use - default is "shingle"
@ -111,18 +113,31 @@ public final class ShingleFilter extends TokenFilter {
private int minShingleSize;
/**
* The remaining number of filler tokens inserted into the input stream
* The remaining number of filler tokens to be inserted into the input stream
* from which shingles are composed, to handle position increments greater
* than one.
*/
private int numFillerTokensToInsert;
/**
* The next input stream token.
* When the next input stream token has a position increment greater than
* one, it is stored in this field until sufficient filler tokens have been
* inserted to account for the position increment.
*/
private State nextInputStreamToken;
private AttributeSource nextInputStreamToken;
/**
* Whether or not there is a next input stream token.
*/
private boolean isNextInputStreamToken = false;
/**
* Whether at least one unigram or shingle has been output at the current
* position.
*/
private boolean isOutputHere = false;
private final TermAttribute termAtt;
private final CharTermAttribute termAtt;
private final OffsetAttribute offsetAtt;
private final PositionIncrementAttribute posIncrAtt;
private final TypeAttribute typeAtt;
@ -140,7 +155,7 @@ public final class ShingleFilter extends TokenFilter {
super(input);
setMaxShingleSize(maxShingleSize);
setMinShingleSize(minShingleSize);
this.termAtt = addAttribute(TermAttribute.class);
this.termAtt = addAttribute(CharTermAttribute.class);
this.offsetAtt = addAttribute(OffsetAttribute.class);
this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
this.typeAtt = addAttribute(TypeAttribute.class);
@ -241,23 +256,49 @@ public final class ShingleFilter extends TokenFilter {
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
}
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#next()
*/
@Override
public final boolean incrementToken() throws IOException {
boolean tokenAvailable = false;
boolean tokenAvailable = false;
int builtGramSize = 0;
if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
shiftInputWindow();
gramBuilder.setLength(0);
} else {
builtGramSize = gramSize.getPreviousValue();
}
if ( ! inputWindow.isEmpty()) {
restoreState(inputWindow.getFirst());
if (1 == gramSize.getValue()) {
posIncrAtt.setPositionIncrement(1);
gramSize.advance();
tokenAvailable = true;
} else if (inputWindow.size() >= gramSize.getValue()) {
getNextShingle();
if (inputWindow.size() >= gramSize.getValue()) {
boolean isAllFiller = true;
InputWindowToken nextToken = null;
Iterator<InputWindowToken> iter = inputWindow.iterator();
for (int gramNum = 1 ;
iter.hasNext() && builtGramSize < gramSize.getValue() ;
++gramNum) {
nextToken = iter.next();
if (builtGramSize < gramNum) {
if (builtGramSize > 0) {
gramBuilder.append(tokenSeparator);
}
gramBuilder.append(nextToken.termAtt.buffer(), 0,
nextToken.termAtt.length());
++builtGramSize;
}
if (isAllFiller && nextToken.isFiller) {
if (gramNum == gramSize.getValue()) {
gramSize.advance();
}
} else {
isAllFiller = false;
}
}
if ( ! isAllFiller && builtGramSize == gramSize.getValue()) {
inputWindow.getFirst().attSource.copyTo(this);
posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
termAtt.setEmpty().append(gramBuilder);
if (gramSize.getValue() > 1) {
typeAtt.setType(tokenType);
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
isOutputHere = true;
gramSize.advance();
tokenAvailable = true;
}
@ -265,83 +306,69 @@ public final class ShingleFilter extends TokenFilter {
return tokenAvailable;
}
/**
* <p>Makes the next token a shingle of length {@link #gramSize},
* composed of tokens taken from {@link #inputWindow}.
* <p>Callers of this method must first insure that there are at least
* <code>gramSize</code> tokens available in <code>inputWindow</code>.
*/
private void getNextShingle() {
int startOffset = offsetAtt.startOffset();
int minTokNum = gramSize.getValue() - 1; // zero-based inputWindow position
if (gramSize.getValue() == minShingleSize) {
// Clear the shingle text buffer if this is the first shingle
// at the current position in the input stream.
shingleBuilder.setLength(0);
minTokNum = 0;
}
for (int tokNum = minTokNum ; tokNum < gramSize.getValue() ; ++tokNum) {
if (tokNum > 0) {
shingleBuilder.append(tokenSeparator);
}
restoreState(inputWindow.get(tokNum));
shingleBuilder.append(termAtt.termBuffer(), 0, termAtt.termLength());
}
char[] termBuffer = termAtt.termBuffer();
int termLength = shingleBuilder.length();
if (termBuffer.length < termLength) {
termBuffer = termAtt.resizeTermBuffer(termLength);
}
shingleBuilder.getChars(0, termLength, termBuffer, 0);
termAtt.setTermLength(termLength);
posIncrAtt.setPositionIncrement(gramSize.atMinValue() ? 1 : 0);
typeAtt.setType(tokenType);
offsetAtt.setOffset(startOffset, offsetAtt.endOffset());
}
/**
* <p>Get the next token from the input stream.
* <p>If the next token has <code>positionIncrement > 1</code>,
* <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
* inserted first.
* @return false for end of stream; true otherwise
* @param target Where to put the new token; if null, a new instance is created.
* @return On success, the populated token; null otherwise
* @throws IOException if the input stream has a problem
*/
private boolean getNextToken() throws IOException {
boolean success = false;
private InputWindowToken getNextToken(InputWindowToken target)
throws IOException {
InputWindowToken newTarget = target;
if (numFillerTokensToInsert > 0) {
insertFillerToken();
success = true;
} else if (null != nextInputStreamToken) {
restoreState(nextInputStreamToken);
nextInputStreamToken = null;
success = true;
} else if (input.incrementToken()) {
if (posIncrAtt.getPositionIncrement() > 1) {
numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1;
insertFillerToken();
if (null == target) {
newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
} else {
nextInputStreamToken.copyTo(target.attSource);
}
// A filler token occupies no space
newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
newTarget.offsetAtt.startOffset());
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else if (isNextInputStreamToken) {
if (null == target) {
newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
} else {
nextInputStreamToken.copyTo(target.attSource);
}
isNextInputStreamToken = false;
newTarget.isFiller = false;
} else if (input.incrementToken()) {
if (null == target) {
newTarget = new InputWindowToken(cloneAttributes());
} else {
this.copyTo(target.attSource);
}
if (posIncrAtt.getPositionIncrement() > 1) {
// Each output shingle must contain at least one input token,
// so no more than (maxShingleSize - 1) filler tokens will be inserted.
numFillerTokensToInsert
= Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
// Save the current token as the next input stream token
if (null == nextInputStreamToken) {
nextInputStreamToken = cloneAttributes();
} else {
this.copyTo(nextInputStreamToken);
}
isNextInputStreamToken = true;
// A filler token occupies no space
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else {
newTarget.isFiller = false;
}
success = true;
}
return success;
}
/**
* Inserts a {@link #FILLER_TOKEN} and decrements
* {@link #numFillerTokensToInsert}.
*/
private void insertFillerToken() {
if (null == nextInputStreamToken) {
nextInputStreamToken = captureState();
} else {
restoreState(nextInputStreamToken);
newTarget = null;
}
--numFillerTokensToInsert;
// A filler token occupies no space
offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
}
return newTarget;
}
/**
* <p>Fills {@link #inputWindow} with input stream tokens, if available,
@ -351,16 +378,29 @@ public final class ShingleFilter extends TokenFilter {
* @throws IOException if there's a problem getting the next token
*/
private void shiftInputWindow() throws IOException {
InputWindowToken firstToken = null;
if (inputWindow.size() > 0) {
inputWindow.removeFirst();
firstToken = inputWindow.removeFirst();
}
while (getNextToken()) {
inputWindow.add(captureState());
if (inputWindow.size() == maxShingleSize) {
break;
while (inputWindow.size() < maxShingleSize) {
if (null != firstToken) { // recycle the firstToken, if available
if (null != getNextToken(firstToken)) {
inputWindow.add(firstToken); // the firstToken becomes the last
firstToken = null;
} else {
break; // end of input stream
}
} else {
InputWindowToken nextToken = getNextToken(null);
if (null != nextToken) {
inputWindow.add(nextToken);
} else {
break; // end of input stream
}
}
}
gramSize.reset();
isOutputHere = false;
}
@Override
@ -369,6 +409,7 @@ public final class ShingleFilter extends TokenFilter {
gramSize.reset();
inputWindow.clear();
numFillerTokensToInsert = 0;
isOutputHere = false;
}
@ -383,6 +424,7 @@ public final class ShingleFilter extends TokenFilter {
*/
private class CircularSequence {
private int value;
private int previousValue;
private int minValue;
public CircularSequence() {
@ -405,10 +447,9 @@ public final class ShingleFilter extends TokenFilter {
* <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
* <p>1 is included in the circular sequence only if
* {@link #outputUnigrams} = true.
*
* @return the next member in the circular sequence
*/
public int advance() {
public void advance() {
previousValue = value;
if (value == 1) {
value = minShingleSize;
} else if (value == maxShingleSize) {
@ -416,7 +457,6 @@ public final class ShingleFilter extends TokenFilter {
} else {
++value;
}
return value;
}
/**
@ -428,7 +468,7 @@ public final class ShingleFilter extends TokenFilter {
* {@link #outputUnigrams} = true.
*/
public void reset() {
value = minValue;
previousValue = value = minValue;
}
/**
@ -443,5 +483,25 @@ public final class ShingleFilter extends TokenFilter {
public boolean atMinValue() {
return value == minValue;
}
/**
* @return the value this instance had before the last advance() call
*/
public int getPreviousValue() {
return previousValue;
}
}
private class InputWindowToken {
final AttributeSource attSource;
final CharTermAttribute termAtt;
final OffsetAttribute offsetAtt;
boolean isFiller = false;
public InputWindowToken(AttributeSource attSource) {
this.attSource = attSource;
this.termAtt = attSource.getAttribute(CharTermAttribute.class);
this.offsetAtt = attSource.getAttribute(OffsetAttribute.class);
}
}
}

View File

@ -34,7 +34,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
protected int index = 0;
protected Token[] testToken;
private TermAttribute termAtt;
private CharTermAttribute termAtt;
private OffsetAttribute offsetAtt;
private PositionIncrementAttribute posIncrAtt;
private TypeAttribute typeAtt;
@ -42,7 +42,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
public TestTokenStream(Token[] testToken) {
super();
this.testToken = testToken;
this.termAtt = addAttribute(TermAttribute.class);
this.termAtt = addAttribute(CharTermAttribute.class);
this.offsetAtt = addAttribute(OffsetAttribute.class);
this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
this.typeAtt = addAttribute(TypeAttribute.class);
@ -53,7 +53,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
clearAttributes();
if (index < testToken.length) {
Token t = testToken[index++];
termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
termAtt.copyBuffer(t.buffer(), 0, t.length());
offsetAtt.setOffset(t.startOffset(), t.endOffset());
posIncrAtt.setPositionIncrement(t.getPositionIncrement());
typeAtt.setType(TypeAttributeImpl.DEFAULT_TYPE);
@ -103,17 +103,20 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
createToken("please divide", 0, 13),
createToken("divide", 7, 13),
createToken("divide _", 7, 19),
createToken("_", 19, 19),
createToken("_ sentence", 19, 27),
createToken("sentence", 19, 27),
createToken("sentence _", 19, 33),
createToken("_", 33, 33),
createToken("_ shingles", 33, 39),
createToken("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES = new int[] {
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
1, 0, 1, 0, 1, 1, 0, 1, 1
};
private static final String[] BI_GRAM_TYPES_WITH_HOLES = {
"word", "shingle",
"word", "shingle", "shingle", "word", "shingle", "shingle", "word"
};
public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
@ -642,18 +645,157 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
"word"
};
public static final Token[] TEST_TOKEN_POS_INCR_EQUAL_TO_N = new Token[] {
createToken("please", 0, 6),
createToken("divide", 7, 13),
createToken("this", 14, 18),
createToken("sentence", 29, 37, 3),
createToken("into", 38, 42),
createToken("shingles", 43, 49),
};
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N = new Token[] {
createToken("please", 0, 6),
createToken("please divide", 0, 13),
createToken("please divide this", 0, 18),
createToken("divide", 7, 13),
createToken("divide this", 7, 18),
createToken("divide this _", 7, 29),
createToken("this", 14, 18),
createToken("this _", 14, 29),
createToken("this _ _", 14, 29),
createToken("_ _ sentence", 29, 37),
createToken("_ sentence", 29, 37),
createToken("_ sentence into", 29, 42),
createToken("sentence", 29, 37),
createToken("sentence into", 29, 42),
createToken("sentence into shingles", 29, 49),
createToken("into", 38, 42),
createToken("into shingles", 38, 49),
createToken("shingles", 43, 49)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N = new int[] {
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1
};
public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N = new String[] {
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"shingle", "shingle", "shingle", "word", "shingle", "shingle",
"word", "shingle",
"word"
};
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new Token[] {
createToken("please divide", 0, 13),
createToken("please divide this", 0, 18),
createToken("divide this", 7, 18),
createToken("divide this _", 7, 29),
createToken("this _", 14, 29),
createToken("this _ _", 14, 29),
createToken("_ _ sentence", 29, 37),
createToken("_ sentence", 29, 37),
createToken("_ sentence into", 29, 42),
createToken("sentence into", 29, 42),
createToken("sentence into shingles", 29, 49),
createToken("into shingles", 38, 49),
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new int[] {
1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1
};
public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new String[] {
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle", "shingle",
"shingle", "shingle",
"shingle",
};
public static final Token[] TEST_TOKEN_POS_INCR_GREATER_THAN_N = new Token[] {
createToken("please", 0, 6),
createToken("divide", 57, 63, 8),
createToken("this", 64, 68),
createToken("sentence", 69, 77),
createToken("into", 78, 82),
createToken("shingles", 83, 89),
};
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N = new Token[] {
createToken("please", 0, 6),
createToken("please _", 0, 57),
createToken("please _ _", 0, 57),
createToken("_ _ divide", 57, 63),
createToken("_ divide", 57, 63),
createToken("_ divide this", 57, 68),
createToken("divide", 57, 63),
createToken("divide this", 57, 68),
createToken("divide this sentence", 57, 77),
createToken("this", 64, 68),
createToken("this sentence", 64, 77),
createToken("this sentence into", 64, 82),
createToken("sentence", 69, 77),
createToken("sentence into", 69, 82),
createToken("sentence into shingles", 69, 89),
createToken("into", 78, 82),
createToken("into shingles", 78, 89),
createToken("shingles", 83, 89)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N = new int[] {
1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
};
public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N = new String[] {
"word", "shingle", "shingle",
"shingle",
"shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle",
"word"
};
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new Token[] {
createToken("please _", 0, 57),
createToken("please _ _", 0, 57),
createToken("_ _ divide", 57, 63),
createToken("_ divide", 57, 63),
createToken("_ divide this", 57, 68),
createToken("divide this", 57, 68),
createToken("divide this sentence", 57, 77),
createToken("this sentence", 64, 77),
createToken("this sentence into", 64, 82),
createToken("sentence into", 69, 82),
createToken("sentence into shingles", 69, 89),
createToken("into shingles", 78, 89),
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new int[] {
1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1
};
public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new String[] {
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle", "shingle", "shingle", "shingle",
"shingle",
};
@Override
protected void setUp() throws Exception {
super.setUp();
testTokenWithHoles = new Token[] {
createToken("please", 0, 6),
createToken("divide", 7, 13),
createToken("sentence", 19, 27),
createToken("shingles", 33, 39),
createToken("sentence", 19, 27, 2),
createToken("shingles", 33, 39, 2),
};
testTokenWithHoles[2].setPositionIncrement(2);
testTokenWithHoles[3].setPositionIncrement(2);
}
/*
@ -667,7 +809,8 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
public void testBiGramFilterWithHoles() throws IOException {
this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
BI_GRAM_POSITION_INCREMENTS_WITH_HOLES,
BI_GRAM_TYPES_WITH_HOLES,
true);
}
@ -832,7 +975,31 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR,
TRI_GRAM_TYPES_NULL_SEPARATOR, true);
}
public void testPositionIncrementEqualToN() throws IOException {
this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_EQUAL_TO_N, TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N,
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N,
TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N, true);
}
public void testPositionIncrementEqualToNWithoutUnigrams() throws IOException {
this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_EQUAL_TO_N, TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS, false);
}
public void testPositionIncrementGreaterThanN() throws IOException {
this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N,
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N,
TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N, true);
}
public void testPositionIncrementGreaterThanNWithoutUnigrams() throws IOException {
this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, false);
}
public void testReset() throws Exception {
Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
@ -896,18 +1063,24 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
int endOffsets[] = new int[tokensToCompare.length];
for (int i = 0; i < tokensToCompare.length; i++) {
text[i] = tokensToCompare[i].term();
text[i] = new String(tokensToCompare[i].buffer(),0, tokensToCompare[i].length());
startOffsets[i] = tokensToCompare[i].startOffset();
endOffsets[i] = tokensToCompare[i].endOffset();
}
assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
}
private static Token createToken(String term, int start, int offset) {
return createToken(term, start, offset, 1);
}
private static Token createToken(String term, int start, int offset)
private static Token createToken
(String term, int start, int offset, int positionIncrement)
{
Token token = new Token(start, offset);
token.setTermBuffer(term);
token.copyBuffer(term.toCharArray(), 0, term.length());
token.setPositionIncrement(positionIncrement);
return token;
}
}