mirror of https://github.com/apache/lucene.git
LUCENE-5180: ShingleFilter creates shingles from trailing holes
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1524117 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b67683ea21
commit
9a6360e131
|
@ -38,6 +38,10 @@ New Features
|
|||
* SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
|
||||
(Ryo Onodera via Koji Sekiguchi)
|
||||
|
||||
* LUCENE-5180: ShingleFilter now creates shingles with trailing holes,
|
||||
for example if a StopFilter had removed the last token. (Mike
|
||||
McCandless)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-4848: Use Java 7 NIO2-FileChannel instead of RandomAccessFile
|
||||
|
|
|
@ -147,6 +147,12 @@ public final class ShingleFilter extends TokenFilter {
|
|||
* true if no shingles have been output yet (for outputUnigramsIfNoShingles).
|
||||
*/
|
||||
boolean noShingleOutput = true;
|
||||
|
||||
/**
|
||||
* Holds the State after input.end() was called, so we can
|
||||
* restore it in our end() impl.
|
||||
*/
|
||||
private State endState;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
@ -279,7 +285,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
public boolean incrementToken() throws IOException {
|
||||
boolean tokenAvailable = false;
|
||||
int builtGramSize = 0;
|
||||
if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
|
||||
|
@ -364,39 +370,63 @@ public final class ShingleFilter extends TokenFilter {
|
|||
}
|
||||
isNextInputStreamToken = false;
|
||||
newTarget.isFiller = false;
|
||||
} else if (!exhausted && input.incrementToken()) {
|
||||
if (null == target) {
|
||||
newTarget = new InputWindowToken(cloneAttributes());
|
||||
} else {
|
||||
this.copyTo(target.attSource);
|
||||
}
|
||||
if (posIncrAtt.getPositionIncrement() > 1) {
|
||||
// Each output shingle must contain at least one input token,
|
||||
// so no more than (maxShingleSize - 1) filler tokens will be inserted.
|
||||
numFillerTokensToInsert
|
||||
= Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
|
||||
// Save the current token as the next input stream token
|
||||
if (null == nextInputStreamToken) {
|
||||
nextInputStreamToken = cloneAttributes();
|
||||
} else if (!exhausted) {
|
||||
if (input.incrementToken()) {
|
||||
if (null == target) {
|
||||
newTarget = new InputWindowToken(cloneAttributes());
|
||||
} else {
|
||||
this.copyTo(nextInputStreamToken);
|
||||
this.copyTo(target.attSource);
|
||||
}
|
||||
if (posIncrAtt.getPositionIncrement() > 1) {
|
||||
// Each output shingle must contain at least one input token,
|
||||
// so no more than (maxShingleSize - 1) filler tokens will be inserted.
|
||||
numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
|
||||
// Save the current token as the next input stream token
|
||||
if (null == nextInputStreamToken) {
|
||||
nextInputStreamToken = cloneAttributes();
|
||||
} else {
|
||||
this.copyTo(nextInputStreamToken);
|
||||
}
|
||||
isNextInputStreamToken = true;
|
||||
// A filler token occupies no space
|
||||
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
|
||||
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
||||
newTarget.isFiller = true;
|
||||
--numFillerTokensToInsert;
|
||||
} else {
|
||||
newTarget.isFiller = false;
|
||||
}
|
||||
isNextInputStreamToken = true;
|
||||
// A filler token occupies no space
|
||||
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
|
||||
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
||||
newTarget.isFiller = true;
|
||||
--numFillerTokensToInsert;
|
||||
} else {
|
||||
newTarget.isFiller = false;
|
||||
exhausted = true;
|
||||
input.end();
|
||||
endState = captureState();
|
||||
numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
|
||||
if (numFillerTokensToInsert > 0) {
|
||||
nextInputStreamToken = new AttributeSource(getAttributeFactory());
|
||||
nextInputStreamToken.addAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
|
||||
newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
|
||||
// Recurse/loop just once:
|
||||
return getNextToken(target);
|
||||
} else {
|
||||
newTarget = null;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
newTarget = null;
|
||||
exhausted = true;
|
||||
}
|
||||
return newTarget;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
if (!exhausted) {
|
||||
super.end();
|
||||
} else {
|
||||
restoreState(endState);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Fills {@link #inputWindow} with input stream tokens, if available,
|
||||
* shifting to the right if the window was previously full.
|
||||
|
@ -445,6 +475,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
isOutputHere = false;
|
||||
noShingleOutput = true;
|
||||
exhausted = false;
|
||||
endState = null;
|
||||
if (outputUnigramsIfNoShingles && ! outputUnigrams) {
|
||||
// Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
|
||||
gramSize.minValue = minShingleSize;
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.util.Random;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CannedTokenStream;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -34,41 +35,6 @@ import org.apache.lucene.analysis.tokenattributes.*;
|
|||
|
||||
public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
||||
|
||||
public class TestTokenStream extends TokenStream {
|
||||
|
||||
protected int index = 0;
|
||||
protected Token[] testToken;
|
||||
|
||||
private CharTermAttribute termAtt;
|
||||
private OffsetAttribute offsetAtt;
|
||||
private PositionIncrementAttribute posIncrAtt;
|
||||
private TypeAttribute typeAtt;
|
||||
|
||||
public TestTokenStream(Token[] testToken) {
|
||||
super();
|
||||
this.testToken = testToken;
|
||||
this.termAtt = addAttribute(CharTermAttribute.class);
|
||||
this.offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
this.typeAtt = addAttribute(TypeAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() {
|
||||
clearAttributes();
|
||||
if (index < testToken.length) {
|
||||
Token t = testToken[index++];
|
||||
termAtt.copyBuffer(t.buffer(), 0, t.length());
|
||||
offsetAtt.setOffset(t.startOffset(), t.endOffset());
|
||||
posIncrAtt.setPositionIncrement(t.getPositionIncrement());
|
||||
typeAtt.setType(TypeAttribute.DEFAULT_TYPE);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static final Token[] TEST_TOKEN = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("divide", 7, 13),
|
||||
|
@ -1066,7 +1032,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
boolean outputUnigrams)
|
||||
throws IOException {
|
||||
|
||||
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
|
||||
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(tokensToShingle), maxSize);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||
}
|
||||
|
@ -1076,7 +1042,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
String[] types, boolean outputUnigrams)
|
||||
throws IOException {
|
||||
ShingleFilter filter
|
||||
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
||||
= new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||
}
|
||||
|
@ -1087,7 +1053,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
boolean outputUnigramsIfNoShingles)
|
||||
throws IOException {
|
||||
ShingleFilter filter
|
||||
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
||||
= new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||
|
@ -1098,7 +1064,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
String[] types, boolean outputUnigrams)
|
||||
throws IOException {
|
||||
ShingleFilter filter
|
||||
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
||||
= new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
|
||||
filter.setTokenSeparator(tokenSeparator);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||
|
@ -1170,4 +1136,63 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
};
|
||||
checkOneTermReuse(a, "", "");
|
||||
}
|
||||
|
||||
public void testTrailingHole1() throws IOException {
|
||||
// Analyzing "wizard of", where of is removed as a
|
||||
// stopword leaving a trailing hole:
|
||||
Token[] inputTokens = new Token[] {createToken("wizard", 0, 6)};
|
||||
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 9, inputTokens), 2, 2);
|
||||
|
||||
assertTokenStreamContents(filter,
|
||||
new String[] {"wizard", "wizard _"},
|
||||
new int[] {0, 0},
|
||||
new int[] {6, 9},
|
||||
new int[] {1, 0},
|
||||
9);
|
||||
}
|
||||
|
||||
public void testTrailingHole2() throws IOException {
|
||||
// Analyzing "purple wizard of", where of is removed as a
|
||||
// stopword leaving a trailing hole:
|
||||
Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
|
||||
createToken("wizard", 7, 13)};
|
||||
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);
|
||||
|
||||
assertTokenStreamContents(filter,
|
||||
new String[] {"purple", "purple wizard", "wizard", "wizard _"},
|
||||
new int[] {0, 0, 7, 7},
|
||||
new int[] {6, 13, 13, 16},
|
||||
new int[] {1, 0, 1, 0},
|
||||
16);
|
||||
}
|
||||
|
||||
public void testTwoTrailingHoles() throws IOException {
|
||||
// Analyzing "purple wizard of the", where of and the are removed as a
|
||||
// stopwords, leaving two trailing holes:
|
||||
Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
|
||||
createToken("wizard", 7, 13)};
|
||||
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 2);
|
||||
|
||||
assertTokenStreamContents(filter,
|
||||
new String[] {"purple", "purple wizard", "wizard", "wizard _"},
|
||||
new int[] {0, 0, 7, 7},
|
||||
new int[] {6, 13, 13, 20},
|
||||
new int[] {1, 0, 1, 0},
|
||||
20);
|
||||
}
|
||||
|
||||
public void testTwoTrailingHolesTriShingle() throws IOException {
|
||||
// Analyzing "purple wizard of the", where of and the are removed as a
|
||||
// stopwords, leaving two trailing holes:
|
||||
Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
|
||||
createToken("wizard", 7, 13)};
|
||||
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
|
||||
|
||||
assertTokenStreamContents(filter,
|
||||
new String[] {"purple", "purple wizard", "purple wizard _", "wizard", "wizard _", "wizard _ _"},
|
||||
new int[] {0, 0, 0, 7, 7, 7},
|
||||
new int[] {6, 13, 20, 13, 20, 20},
|
||||
new int[] {1, 0, 0, 1, 0, 0},
|
||||
20);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
|
@ -34,9 +36,28 @@ public final class CannedTokenStream extends TokenStream {
|
|||
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
|
||||
|
||||
private final int finalOffset;
|
||||
private final int finalPosInc;
|
||||
|
||||
public CannedTokenStream(Token... tokens) {
|
||||
this.tokens = tokens;
|
||||
finalOffset = 0;
|
||||
finalPosInc = 0;
|
||||
}
|
||||
|
||||
/** If you want trailing holes, pass a non-zero
|
||||
* finalPosInc. */
|
||||
public CannedTokenStream(int finalPosInc, int finalOffset, Token... tokens) {
|
||||
this.tokens = tokens;
|
||||
this.finalOffset = finalOffset;
|
||||
this.finalPosInc = finalPosInc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
super.end();
|
||||
posIncrAtt.setPositionIncrement(finalPosInc);
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue