mirror of https://github.com/apache/lucene.git
LUCENE-2400: ShingleFilter: don't output all-filler shingles/unigrams; also, convert from TermAttribute to CharTermAttribute
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940451 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5abbf3429c
commit
7cd9a8da03
|
@ -145,6 +145,10 @@ New features
|
|||
multiple languages. The default configuration includes special support for
|
||||
Thai, Lao, Myanmar, and Khmer. (Robert Muir, Uwe Schindler)
|
||||
|
||||
* LUCENE-2400: ShingleFilter was changed to don't output all-filler shingles and
|
||||
unigrams, and uses a more performant algorithm to build grams using a linked list
|
||||
of AttributeSource.cloneAttributes() instances and the new copyTo() method.
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||
|
|
|
@ -18,14 +18,16 @@ package org.apache.lucene.analysis.shingle;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -66,12 +68,12 @@ public final class ShingleFilter extends TokenFilter {
|
|||
*/
|
||||
public static final String TOKEN_SEPARATOR = " ";
|
||||
|
||||
|
||||
/**
|
||||
* The sequence of input stream tokens (or filler tokens, if necessary)
|
||||
* that will be composed to form output shingles.
|
||||
*/
|
||||
private LinkedList<State> inputWindow = new LinkedList<State>();
|
||||
private LinkedList<InputWindowToken> inputWindow
|
||||
= new LinkedList<InputWindowToken>();
|
||||
|
||||
/**
|
||||
* The number of input tokens in the next output token. This is the "n" in
|
||||
|
@ -80,9 +82,9 @@ public final class ShingleFilter extends TokenFilter {
|
|||
private CircularSequence gramSize;
|
||||
|
||||
/**
|
||||
* Shingle text is composed here.
|
||||
* Shingle and unigram text is composed here.
|
||||
*/
|
||||
private StringBuilder shingleBuilder = new StringBuilder();
|
||||
private StringBuilder gramBuilder = new StringBuilder();
|
||||
|
||||
/**
|
||||
* The token type attribute value to use - default is "shingle"
|
||||
|
@ -111,18 +113,31 @@ public final class ShingleFilter extends TokenFilter {
|
|||
private int minShingleSize;
|
||||
|
||||
/**
|
||||
* The remaining number of filler tokens inserted into the input stream
|
||||
* The remaining number of filler tokens to be inserted into the input stream
|
||||
* from which shingles are composed, to handle position increments greater
|
||||
* than one.
|
||||
*/
|
||||
private int numFillerTokensToInsert;
|
||||
|
||||
/**
|
||||
* The next input stream token.
|
||||
* When the next input stream token has a position increment greater than
|
||||
* one, it is stored in this field until sufficient filler tokens have been
|
||||
* inserted to account for the position increment.
|
||||
*/
|
||||
private State nextInputStreamToken;
|
||||
private AttributeSource nextInputStreamToken;
|
||||
|
||||
/**
|
||||
* Whether or not there is a next input stream token.
|
||||
*/
|
||||
private boolean isNextInputStreamToken = false;
|
||||
|
||||
/**
|
||||
* Whether at least one unigram or shingle has been output at the current
|
||||
* position.
|
||||
*/
|
||||
private boolean isOutputHere = false;
|
||||
|
||||
private final TermAttribute termAtt;
|
||||
private final CharTermAttribute termAtt;
|
||||
private final OffsetAttribute offsetAtt;
|
||||
private final PositionIncrementAttribute posIncrAtt;
|
||||
private final TypeAttribute typeAtt;
|
||||
|
@ -140,7 +155,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
super(input);
|
||||
setMaxShingleSize(maxShingleSize);
|
||||
setMinShingleSize(minShingleSize);
|
||||
this.termAtt = addAttribute(TermAttribute.class);
|
||||
this.termAtt = addAttribute(CharTermAttribute.class);
|
||||
this.offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
this.typeAtt = addAttribute(TypeAttribute.class);
|
||||
|
@ -241,23 +256,49 @@ public final class ShingleFilter extends TokenFilter {
|
|||
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||
*/
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
boolean tokenAvailable = false;
|
||||
boolean tokenAvailable = false;
|
||||
int builtGramSize = 0;
|
||||
if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
|
||||
shiftInputWindow();
|
||||
gramBuilder.setLength(0);
|
||||
} else {
|
||||
builtGramSize = gramSize.getPreviousValue();
|
||||
}
|
||||
if ( ! inputWindow.isEmpty()) {
|
||||
restoreState(inputWindow.getFirst());
|
||||
if (1 == gramSize.getValue()) {
|
||||
posIncrAtt.setPositionIncrement(1);
|
||||
gramSize.advance();
|
||||
tokenAvailable = true;
|
||||
} else if (inputWindow.size() >= gramSize.getValue()) {
|
||||
getNextShingle();
|
||||
if (inputWindow.size() >= gramSize.getValue()) {
|
||||
boolean isAllFiller = true;
|
||||
InputWindowToken nextToken = null;
|
||||
Iterator<InputWindowToken> iter = inputWindow.iterator();
|
||||
for (int gramNum = 1 ;
|
||||
iter.hasNext() && builtGramSize < gramSize.getValue() ;
|
||||
++gramNum) {
|
||||
nextToken = iter.next();
|
||||
if (builtGramSize < gramNum) {
|
||||
if (builtGramSize > 0) {
|
||||
gramBuilder.append(tokenSeparator);
|
||||
}
|
||||
gramBuilder.append(nextToken.termAtt.buffer(), 0,
|
||||
nextToken.termAtt.length());
|
||||
++builtGramSize;
|
||||
}
|
||||
if (isAllFiller && nextToken.isFiller) {
|
||||
if (gramNum == gramSize.getValue()) {
|
||||
gramSize.advance();
|
||||
}
|
||||
} else {
|
||||
isAllFiller = false;
|
||||
}
|
||||
}
|
||||
if ( ! isAllFiller && builtGramSize == gramSize.getValue()) {
|
||||
inputWindow.getFirst().attSource.copyTo(this);
|
||||
posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
|
||||
termAtt.setEmpty().append(gramBuilder);
|
||||
if (gramSize.getValue() > 1) {
|
||||
typeAtt.setType(tokenType);
|
||||
}
|
||||
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
|
||||
isOutputHere = true;
|
||||
gramSize.advance();
|
||||
tokenAvailable = true;
|
||||
}
|
||||
|
@ -265,83 +306,69 @@ public final class ShingleFilter extends TokenFilter {
|
|||
return tokenAvailable;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Makes the next token a shingle of length {@link #gramSize},
|
||||
* composed of tokens taken from {@link #inputWindow}.
|
||||
* <p>Callers of this method must first insure that there are at least
|
||||
* <code>gramSize</code> tokens available in <code>inputWindow</code>.
|
||||
*/
|
||||
private void getNextShingle() {
|
||||
int startOffset = offsetAtt.startOffset();
|
||||
|
||||
int minTokNum = gramSize.getValue() - 1; // zero-based inputWindow position
|
||||
if (gramSize.getValue() == minShingleSize) {
|
||||
// Clear the shingle text buffer if this is the first shingle
|
||||
// at the current position in the input stream.
|
||||
shingleBuilder.setLength(0);
|
||||
minTokNum = 0;
|
||||
}
|
||||
for (int tokNum = minTokNum ; tokNum < gramSize.getValue() ; ++tokNum) {
|
||||
if (tokNum > 0) {
|
||||
shingleBuilder.append(tokenSeparator);
|
||||
}
|
||||
restoreState(inputWindow.get(tokNum));
|
||||
shingleBuilder.append(termAtt.termBuffer(), 0, termAtt.termLength());
|
||||
}
|
||||
char[] termBuffer = termAtt.termBuffer();
|
||||
int termLength = shingleBuilder.length();
|
||||
if (termBuffer.length < termLength) {
|
||||
termBuffer = termAtt.resizeTermBuffer(termLength);
|
||||
}
|
||||
shingleBuilder.getChars(0, termLength, termBuffer, 0);
|
||||
termAtt.setTermLength(termLength);
|
||||
posIncrAtt.setPositionIncrement(gramSize.atMinValue() ? 1 : 0);
|
||||
typeAtt.setType(tokenType);
|
||||
offsetAtt.setOffset(startOffset, offsetAtt.endOffset());
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Get the next token from the input stream.
|
||||
* <p>If the next token has <code>positionIncrement > 1</code>,
|
||||
* <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
|
||||
* inserted first.
|
||||
* @return false for end of stream; true otherwise
|
||||
* @param target Where to put the new token; if null, a new instance is created.
|
||||
* @return On success, the populated token; null otherwise
|
||||
* @throws IOException if the input stream has a problem
|
||||
*/
|
||||
private boolean getNextToken() throws IOException {
|
||||
boolean success = false;
|
||||
private InputWindowToken getNextToken(InputWindowToken target)
|
||||
throws IOException {
|
||||
InputWindowToken newTarget = target;
|
||||
if (numFillerTokensToInsert > 0) {
|
||||
insertFillerToken();
|
||||
success = true;
|
||||
} else if (null != nextInputStreamToken) {
|
||||
restoreState(nextInputStreamToken);
|
||||
nextInputStreamToken = null;
|
||||
success = true;
|
||||
} else if (input.incrementToken()) {
|
||||
if (posIncrAtt.getPositionIncrement() > 1) {
|
||||
numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1;
|
||||
insertFillerToken();
|
||||
if (null == target) {
|
||||
newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
|
||||
} else {
|
||||
nextInputStreamToken.copyTo(target.attSource);
|
||||
}
|
||||
// A filler token occupies no space
|
||||
newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
|
||||
newTarget.offsetAtt.startOffset());
|
||||
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
||||
newTarget.isFiller = true;
|
||||
--numFillerTokensToInsert;
|
||||
} else if (isNextInputStreamToken) {
|
||||
if (null == target) {
|
||||
newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
|
||||
} else {
|
||||
nextInputStreamToken.copyTo(target.attSource);
|
||||
}
|
||||
isNextInputStreamToken = false;
|
||||
newTarget.isFiller = false;
|
||||
} else if (input.incrementToken()) {
|
||||
if (null == target) {
|
||||
newTarget = new InputWindowToken(cloneAttributes());
|
||||
} else {
|
||||
this.copyTo(target.attSource);
|
||||
}
|
||||
if (posIncrAtt.getPositionIncrement() > 1) {
|
||||
// Each output shingle must contain at least one input token,
|
||||
// so no more than (maxShingleSize - 1) filler tokens will be inserted.
|
||||
numFillerTokensToInsert
|
||||
= Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
|
||||
// Save the current token as the next input stream token
|
||||
if (null == nextInputStreamToken) {
|
||||
nextInputStreamToken = cloneAttributes();
|
||||
} else {
|
||||
this.copyTo(nextInputStreamToken);
|
||||
}
|
||||
isNextInputStreamToken = true;
|
||||
// A filler token occupies no space
|
||||
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
|
||||
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
||||
newTarget.isFiller = true;
|
||||
--numFillerTokensToInsert;
|
||||
} else {
|
||||
newTarget.isFiller = false;
|
||||
}
|
||||
success = true;
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts a {@link #FILLER_TOKEN} and decrements
|
||||
* {@link #numFillerTokensToInsert}.
|
||||
*/
|
||||
private void insertFillerToken() {
|
||||
if (null == nextInputStreamToken) {
|
||||
nextInputStreamToken = captureState();
|
||||
} else {
|
||||
restoreState(nextInputStreamToken);
|
||||
newTarget = null;
|
||||
}
|
||||
--numFillerTokensToInsert;
|
||||
// A filler token occupies no space
|
||||
offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
|
||||
termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
||||
}
|
||||
return newTarget;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Fills {@link #inputWindow} with input stream tokens, if available,
|
||||
|
@ -351,16 +378,29 @@ public final class ShingleFilter extends TokenFilter {
|
|||
* @throws IOException if there's a problem getting the next token
|
||||
*/
|
||||
private void shiftInputWindow() throws IOException {
|
||||
InputWindowToken firstToken = null;
|
||||
if (inputWindow.size() > 0) {
|
||||
inputWindow.removeFirst();
|
||||
firstToken = inputWindow.removeFirst();
|
||||
}
|
||||
while (getNextToken()) {
|
||||
inputWindow.add(captureState());
|
||||
if (inputWindow.size() == maxShingleSize) {
|
||||
break;
|
||||
while (inputWindow.size() < maxShingleSize) {
|
||||
if (null != firstToken) { // recycle the firstToken, if available
|
||||
if (null != getNextToken(firstToken)) {
|
||||
inputWindow.add(firstToken); // the firstToken becomes the last
|
||||
firstToken = null;
|
||||
} else {
|
||||
break; // end of input stream
|
||||
}
|
||||
} else {
|
||||
InputWindowToken nextToken = getNextToken(null);
|
||||
if (null != nextToken) {
|
||||
inputWindow.add(nextToken);
|
||||
} else {
|
||||
break; // end of input stream
|
||||
}
|
||||
}
|
||||
}
|
||||
gramSize.reset();
|
||||
isOutputHere = false;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -369,6 +409,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
gramSize.reset();
|
||||
inputWindow.clear();
|
||||
numFillerTokensToInsert = 0;
|
||||
isOutputHere = false;
|
||||
}
|
||||
|
||||
|
||||
|
@ -383,6 +424,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
*/
|
||||
private class CircularSequence {
|
||||
private int value;
|
||||
private int previousValue;
|
||||
private int minValue;
|
||||
|
||||
public CircularSequence() {
|
||||
|
@ -405,10 +447,9 @@ public final class ShingleFilter extends TokenFilter {
|
|||
* <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
|
||||
* <p>1 is included in the circular sequence only if
|
||||
* {@link #outputUnigrams} = true.
|
||||
*
|
||||
* @return the next member in the circular sequence
|
||||
*/
|
||||
public int advance() {
|
||||
public void advance() {
|
||||
previousValue = value;
|
||||
if (value == 1) {
|
||||
value = minShingleSize;
|
||||
} else if (value == maxShingleSize) {
|
||||
|
@ -416,7 +457,6 @@ public final class ShingleFilter extends TokenFilter {
|
|||
} else {
|
||||
++value;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -428,7 +468,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
* {@link #outputUnigrams} = true.
|
||||
*/
|
||||
public void reset() {
|
||||
value = minValue;
|
||||
previousValue = value = minValue;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -443,5 +483,25 @@ public final class ShingleFilter extends TokenFilter {
|
|||
public boolean atMinValue() {
|
||||
return value == minValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the value this instance had before the last advance() call
|
||||
*/
|
||||
public int getPreviousValue() {
|
||||
return previousValue;
|
||||
}
|
||||
}
|
||||
|
||||
private class InputWindowToken {
|
||||
final AttributeSource attSource;
|
||||
final CharTermAttribute termAtt;
|
||||
final OffsetAttribute offsetAtt;
|
||||
boolean isFiller = false;
|
||||
|
||||
public InputWindowToken(AttributeSource attSource) {
|
||||
this.attSource = attSource;
|
||||
this.termAtt = attSource.getAttribute(CharTermAttribute.class);
|
||||
this.offsetAtt = attSource.getAttribute(OffsetAttribute.class);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,7 +34,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
protected int index = 0;
|
||||
protected Token[] testToken;
|
||||
|
||||
private TermAttribute termAtt;
|
||||
private CharTermAttribute termAtt;
|
||||
private OffsetAttribute offsetAtt;
|
||||
private PositionIncrementAttribute posIncrAtt;
|
||||
private TypeAttribute typeAtt;
|
||||
|
@ -42,7 +42,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
public TestTokenStream(Token[] testToken) {
|
||||
super();
|
||||
this.testToken = testToken;
|
||||
this.termAtt = addAttribute(TermAttribute.class);
|
||||
this.termAtt = addAttribute(CharTermAttribute.class);
|
||||
this.offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
this.typeAtt = addAttribute(TypeAttribute.class);
|
||||
|
@ -53,7 +53,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
clearAttributes();
|
||||
if (index < testToken.length) {
|
||||
Token t = testToken[index++];
|
||||
termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
|
||||
termAtt.copyBuffer(t.buffer(), 0, t.length());
|
||||
offsetAtt.setOffset(t.startOffset(), t.endOffset());
|
||||
posIncrAtt.setPositionIncrement(t.getPositionIncrement());
|
||||
typeAtt.setType(TypeAttributeImpl.DEFAULT_TYPE);
|
||||
|
@ -103,17 +103,20 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
createToken("please divide", 0, 13),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("divide _", 7, 19),
|
||||
createToken("_", 19, 19),
|
||||
createToken("_ sentence", 19, 27),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentence _", 19, 33),
|
||||
createToken("_", 33, 33),
|
||||
createToken("_ shingles", 33, 39),
|
||||
createToken("shingles", 33, 39),
|
||||
};
|
||||
|
||||
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES = new int[] {
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
|
||||
1, 0, 1, 0, 1, 1, 0, 1, 1
|
||||
};
|
||||
|
||||
private static final String[] BI_GRAM_TYPES_WITH_HOLES = {
|
||||
"word", "shingle",
|
||||
"word", "shingle", "shingle", "word", "shingle", "shingle", "word"
|
||||
};
|
||||
|
||||
public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
|
||||
|
@ -642,18 +645,157 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
"word"
|
||||
};
|
||||
|
||||
public static final Token[] TEST_TOKEN_POS_INCR_EQUAL_TO_N = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("this", 14, 18),
|
||||
createToken("sentence", 29, 37, 3),
|
||||
createToken("into", 38, 42),
|
||||
createToken("shingles", 43, 49),
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("please divide", 0, 13),
|
||||
createToken("please divide this", 0, 18),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("divide this", 7, 18),
|
||||
createToken("divide this _", 7, 29),
|
||||
createToken("this", 14, 18),
|
||||
createToken("this _", 14, 29),
|
||||
createToken("this _ _", 14, 29),
|
||||
createToken("_ _ sentence", 29, 37),
|
||||
createToken("_ sentence", 29, 37),
|
||||
createToken("_ sentence into", 29, 42),
|
||||
createToken("sentence", 29, 37),
|
||||
createToken("sentence into", 29, 42),
|
||||
createToken("sentence into shingles", 29, 49),
|
||||
createToken("into", 38, 42),
|
||||
createToken("into shingles", 38, 49),
|
||||
createToken("shingles", 43, 49)
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N = new int[] {
|
||||
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N = new String[] {
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"shingle", "shingle", "shingle", "word", "shingle", "shingle",
|
||||
"word", "shingle",
|
||||
"word"
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new Token[] {
|
||||
createToken("please divide", 0, 13),
|
||||
createToken("please divide this", 0, 18),
|
||||
createToken("divide this", 7, 18),
|
||||
createToken("divide this _", 7, 29),
|
||||
createToken("this _", 14, 29),
|
||||
createToken("this _ _", 14, 29),
|
||||
createToken("_ _ sentence", 29, 37),
|
||||
createToken("_ sentence", 29, 37),
|
||||
createToken("_ sentence into", 29, 42),
|
||||
createToken("sentence into", 29, 42),
|
||||
createToken("sentence into shingles", 29, 49),
|
||||
createToken("into shingles", 38, 49),
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new int[] {
|
||||
1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1
|
||||
};
|
||||
|
||||
public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new String[] {
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle",
|
||||
};
|
||||
|
||||
public static final Token[] TEST_TOKEN_POS_INCR_GREATER_THAN_N = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("divide", 57, 63, 8),
|
||||
createToken("this", 64, 68),
|
||||
createToken("sentence", 69, 77),
|
||||
createToken("into", 78, 82),
|
||||
createToken("shingles", 83, 89),
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("please _", 0, 57),
|
||||
createToken("please _ _", 0, 57),
|
||||
createToken("_ _ divide", 57, 63),
|
||||
createToken("_ divide", 57, 63),
|
||||
createToken("_ divide this", 57, 68),
|
||||
createToken("divide", 57, 63),
|
||||
createToken("divide this", 57, 68),
|
||||
createToken("divide this sentence", 57, 77),
|
||||
createToken("this", 64, 68),
|
||||
createToken("this sentence", 64, 77),
|
||||
createToken("this sentence into", 64, 82),
|
||||
createToken("sentence", 69, 77),
|
||||
createToken("sentence into", 69, 82),
|
||||
createToken("sentence into shingles", 69, 89),
|
||||
createToken("into", 78, 82),
|
||||
createToken("into shingles", 78, 89),
|
||||
createToken("shingles", 83, 89)
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N = new int[] {
|
||||
1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
|
||||
};
|
||||
public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N = new String[] {
|
||||
"word", "shingle", "shingle",
|
||||
"shingle",
|
||||
"shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle",
|
||||
"word"
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new Token[] {
|
||||
createToken("please _", 0, 57),
|
||||
createToken("please _ _", 0, 57),
|
||||
createToken("_ _ divide", 57, 63),
|
||||
createToken("_ divide", 57, 63),
|
||||
createToken("_ divide this", 57, 68),
|
||||
createToken("divide this", 57, 68),
|
||||
createToken("divide this sentence", 57, 77),
|
||||
createToken("this sentence", 64, 77),
|
||||
createToken("this sentence into", 64, 82),
|
||||
createToken("sentence into", 69, 82),
|
||||
createToken("sentence into shingles", 69, 89),
|
||||
createToken("into shingles", 78, 89),
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new int[] {
|
||||
1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new String[] {
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle", "shingle", "shingle", "shingle",
|
||||
"shingle",
|
||||
};
|
||||
|
||||
@Override
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
testTokenWithHoles = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("shingles", 33, 39),
|
||||
createToken("sentence", 19, 27, 2),
|
||||
createToken("shingles", 33, 39, 2),
|
||||
};
|
||||
|
||||
testTokenWithHoles[2].setPositionIncrement(2);
|
||||
testTokenWithHoles[3].setPositionIncrement(2);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -667,7 +809,8 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testBiGramFilterWithHoles() throws IOException {
|
||||
this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
|
||||
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
|
||||
BI_GRAM_POSITION_INCREMENTS_WITH_HOLES,
|
||||
BI_GRAM_TYPES_WITH_HOLES,
|
||||
true);
|
||||
}
|
||||
|
||||
|
@ -832,7 +975,31 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR,
|
||||
TRI_GRAM_TYPES_NULL_SEPARATOR, true);
|
||||
}
|
||||
|
||||
public void testPositionIncrementEqualToN() throws IOException {
|
||||
this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_EQUAL_TO_N, TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N,
|
||||
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N,
|
||||
TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N, true);
|
||||
}
|
||||
|
||||
public void testPositionIncrementEqualToNWithoutUnigrams() throws IOException {
|
||||
this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_EQUAL_TO_N, TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
|
||||
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
|
||||
TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS, false);
|
||||
}
|
||||
|
||||
|
||||
public void testPositionIncrementGreaterThanN() throws IOException {
|
||||
this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N,
|
||||
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N,
|
||||
TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N, true);
|
||||
}
|
||||
|
||||
public void testPositionIncrementGreaterThanNWithoutUnigrams() throws IOException {
|
||||
this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
|
||||
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
|
||||
TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, false);
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
|
||||
|
@ -896,18 +1063,24 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
int endOffsets[] = new int[tokensToCompare.length];
|
||||
|
||||
for (int i = 0; i < tokensToCompare.length; i++) {
|
||||
text[i] = tokensToCompare[i].term();
|
||||
text[i] = new String(tokensToCompare[i].buffer(),0, tokensToCompare[i].length());
|
||||
startOffsets[i] = tokensToCompare[i].startOffset();
|
||||
endOffsets[i] = tokensToCompare[i].endOffset();
|
||||
}
|
||||
|
||||
assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset) {
|
||||
return createToken(term, start, offset, 1);
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset)
|
||||
private static Token createToken
|
||||
(String term, int start, int offset, int positionIncrement)
|
||||
{
|
||||
Token token = new Token(start, offset);
|
||||
token.setTermBuffer(term);
|
||||
token.copyBuffer(term.toCharArray(), 0, term.length());
|
||||
token.setPositionIncrement(positionIncrement);
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue