LUCENE-5180: ShingleFilter creates shingles from trailing holes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1524117 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-09-17 16:26:36 +00:00
parent b67683ea21
commit 9a6360e131
4 changed files with 145 additions and 64 deletions

View File

@ -38,6 +38,10 @@ New Features
* SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
(Ryo Onodera via Koji Sekiguchi)
* LUCENE-5180: ShingleFilter now creates shingles with trailing holes,
for example if a StopFilter had removed the last token. (Mike
McCandless)
Optimizations
* LUCENE-4848: Use Java 7 NIO2-FileChannel instead of RandomAccessFile

View File

@ -147,6 +147,12 @@ public final class ShingleFilter extends TokenFilter {
* true if no shingles have been output yet (for outputUnigramsIfNoShingles).
*/
boolean noShingleOutput = true;
/**
* Holds the State after input.end() was called, so we can
* restore it in our end() impl.
*/
private State endState;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@ -279,7 +285,7 @@ public final class ShingleFilter extends TokenFilter {
}
@Override
public final boolean incrementToken() throws IOException {
public boolean incrementToken() throws IOException {
boolean tokenAvailable = false;
int builtGramSize = 0;
if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
@ -364,39 +370,63 @@ public final class ShingleFilter extends TokenFilter {
}
isNextInputStreamToken = false;
newTarget.isFiller = false;
} else if (!exhausted && input.incrementToken()) {
if (null == target) {
newTarget = new InputWindowToken(cloneAttributes());
} else {
this.copyTo(target.attSource);
}
if (posIncrAtt.getPositionIncrement() > 1) {
// Each output shingle must contain at least one input token,
// so no more than (maxShingleSize - 1) filler tokens will be inserted.
numFillerTokensToInsert
= Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
// Save the current token as the next input stream token
if (null == nextInputStreamToken) {
nextInputStreamToken = cloneAttributes();
} else if (!exhausted) {
if (input.incrementToken()) {
if (null == target) {
newTarget = new InputWindowToken(cloneAttributes());
} else {
this.copyTo(nextInputStreamToken);
this.copyTo(target.attSource);
}
if (posIncrAtt.getPositionIncrement() > 1) {
// Each output shingle must contain at least one input token,
// so no more than (maxShingleSize - 1) filler tokens will be inserted.
numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
// Save the current token as the next input stream token
if (null == nextInputStreamToken) {
nextInputStreamToken = cloneAttributes();
} else {
this.copyTo(nextInputStreamToken);
}
isNextInputStreamToken = true;
// A filler token occupies no space
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else {
newTarget.isFiller = false;
}
isNextInputStreamToken = true;
// A filler token occupies no space
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else {
newTarget.isFiller = false;
exhausted = true;
input.end();
endState = captureState();
numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
if (numFillerTokensToInsert > 0) {
nextInputStreamToken = new AttributeSource(getAttributeFactory());
nextInputStreamToken.addAttribute(CharTermAttribute.class);
OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
// Recurse/loop just once:
return getNextToken(target);
} else {
newTarget = null;
}
}
} else {
newTarget = null;
exhausted = true;
}
return newTarget;
}
@Override
public void end() throws IOException {
if (!exhausted) {
super.end();
} else {
restoreState(endState);
}
}
/**
* <p>Fills {@link #inputWindow} with input stream tokens, if available,
* shifting to the right if the window was previously full.
@ -445,6 +475,7 @@ public final class ShingleFilter extends TokenFilter {
isOutputHere = false;
noShingleOutput = true;
exhausted = false;
endState = null;
if (outputUnigramsIfNoShingles && ! outputUnigrams) {
// Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
gramSize.minValue = minShingleSize;

View File

@ -24,6 +24,7 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
@ -34,41 +35,6 @@ import org.apache.lucene.analysis.tokenattributes.*;
public class ShingleFilterTest extends BaseTokenStreamTestCase {
public class TestTokenStream extends TokenStream {
protected int index = 0;
protected Token[] testToken;
private CharTermAttribute termAtt;
private OffsetAttribute offsetAtt;
private PositionIncrementAttribute posIncrAtt;
private TypeAttribute typeAtt;
public TestTokenStream(Token[] testToken) {
super();
this.testToken = testToken;
this.termAtt = addAttribute(CharTermAttribute.class);
this.offsetAtt = addAttribute(OffsetAttribute.class);
this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
this.typeAtt = addAttribute(TypeAttribute.class);
}
@Override
public final boolean incrementToken() {
clearAttributes();
if (index < testToken.length) {
Token t = testToken[index++];
termAtt.copyBuffer(t.buffer(), 0, t.length());
offsetAtt.setOffset(t.startOffset(), t.endOffset());
posIncrAtt.setPositionIncrement(t.getPositionIncrement());
typeAtt.setType(TypeAttribute.DEFAULT_TYPE);
return true;
} else {
return false;
}
}
}
public static final Token[] TEST_TOKEN = new Token[] {
createToken("please", 0, 6),
createToken("divide", 7, 13),
@ -1066,7 +1032,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
boolean outputUnigrams)
throws IOException {
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(tokensToShingle), maxSize);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
@ -1076,7 +1042,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
String[] types, boolean outputUnigrams)
throws IOException {
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
= new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
@ -1087,7 +1053,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
boolean outputUnigramsIfNoShingles)
throws IOException {
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
= new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
@ -1098,7 +1064,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
String[] types, boolean outputUnigrams)
throws IOException {
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
= new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
@ -1170,4 +1136,63 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
};
checkOneTermReuse(a, "", "");
}
public void testTrailingHole1() throws IOException {
// Analyzing "wizard of", where of is removed as a
// stopword leaving a trailing hole:
Token[] inputTokens = new Token[] {createToken("wizard", 0, 6)};
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 9, inputTokens), 2, 2);
assertTokenStreamContents(filter,
new String[] {"wizard", "wizard _"},
new int[] {0, 0},
new int[] {6, 9},
new int[] {1, 0},
9);
}
public void testTrailingHole2() throws IOException {
// Analyzing "purple wizard of", where of is removed as a
// stopword leaving a trailing hole:
Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
createToken("wizard", 7, 13)};
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);
assertTokenStreamContents(filter,
new String[] {"purple", "purple wizard", "wizard", "wizard _"},
new int[] {0, 0, 7, 7},
new int[] {6, 13, 13, 16},
new int[] {1, 0, 1, 0},
16);
}
public void testTwoTrailingHoles() throws IOException {
// Analyzing "purple wizard of the", where of and the are removed as a
// stopwords, leaving two trailing holes:
Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
createToken("wizard", 7, 13)};
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 2);
assertTokenStreamContents(filter,
new String[] {"purple", "purple wizard", "wizard", "wizard _"},
new int[] {0, 0, 7, 7},
new int[] {6, 13, 13, 20},
new int[] {1, 0, 1, 0},
20);
}
public void testTwoTrailingHolesTriShingle() throws IOException {
// Analyzing "purple wizard of the", where of and the are removed as a
// stopwords, leaving two trailing holes:
Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
createToken("wizard", 7, 13)};
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
assertTokenStreamContents(filter,
new String[] {"purple", "purple wizard", "purple wizard _", "wizard", "wizard _", "wizard _ _"},
new int[] {0, 0, 0, 7, 7, 7},
new int[] {6, 13, 20, 13, 20, 20},
new int[] {1, 0, 0, 1, 0, 0},
20);
}
}

View File

@ -17,6 +17,8 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
@ -34,9 +36,28 @@ public final class CannedTokenStream extends TokenStream {
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
private final int finalOffset;
private final int finalPosInc;
public CannedTokenStream(Token... tokens) {
this.tokens = tokens;
finalOffset = 0;
finalPosInc = 0;
}
/** If you want trailing holes, pass a non-zero
* finalPosInc. */
public CannedTokenStream(int finalPosInc, int finalOffset, Token... tokens) {
this.tokens = tokens;
this.finalOffset = finalOffset;
this.finalPosInc = finalPosInc;
}
@Override
public void end() throws IOException {
super.end();
posIncrAtt.setPositionIncrement(finalPosInc);
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override