mirror of https://github.com/apache/lucene.git
LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles can be generated.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1006187 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2e6787a850
commit
f9e4f551e2
|
@ -15,6 +15,9 @@ API Changes
|
||||||
RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
|
RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
|
||||||
behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
|
behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
|
||||||
|
|
||||||
|
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
|
||||||
|
can be generated. (Chris Harris via Steven Rowe)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
||||||
* LUCENE-2413: Consolidated Solr analysis components into common.
|
* LUCENE-2413: Consolidated Solr analysis components into common.
|
||||||
|
|
|
@ -38,6 +38,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
||||||
private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
|
private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
|
||||||
private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
|
private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
|
||||||
private boolean outputUnigrams = true;
|
private boolean outputUnigrams = true;
|
||||||
|
private boolean outputUnigramsIfNoShingles = false;
|
||||||
|
|
||||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
|
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
|
||||||
super();
|
super();
|
||||||
|
@ -147,6 +148,24 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
||||||
public void setOutputUnigrams(boolean outputUnigrams) {
|
public void setOutputUnigrams(boolean outputUnigrams) {
|
||||||
this.outputUnigrams = outputUnigrams;
|
this.outputUnigrams = outputUnigrams;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the outputUnigramsIfNoShingles setting currently held by this analyzer wrapper. */
public boolean isOutputUnigramsIfNoShingles() {
|
||||||
|
return outputUnigramsIfNoShingles;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>Shall we override the behavior of outputUnigrams==false for those
|
||||||
|
* times when no shingles are available (because there are fewer than
|
||||||
|
* minShingleSize tokens in the input stream)? (default: false.)
|
||||||
|
* <p>Note that if outputUnigrams==true, then unigrams are always output,
|
||||||
|
* regardless of whether any shingles are available.
|
||||||
|
*
|
||||||
|
* @param outputUnigramsIfNoShingles Whether or not to output the input
|
||||||
|
* tokens as unigrams when no shingles are available.
|
||||||
|
*/
|
||||||
|
public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
|
||||||
|
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
@ -161,6 +180,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
||||||
filter.setMaxShingleSize(maxShingleSize);
|
filter.setMaxShingleSize(maxShingleSize);
|
||||||
filter.setTokenSeparator(tokenSeparator);
|
filter.setTokenSeparator(tokenSeparator);
|
||||||
filter.setOutputUnigrams(outputUnigrams);
|
filter.setOutputUnigrams(outputUnigrams);
|
||||||
|
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||||
return filter;
|
return filter;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -192,6 +212,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
||||||
streams.shingle.setMinShingleSize(minShingleSize);
|
streams.shingle.setMinShingleSize(minShingleSize);
|
||||||
streams.shingle.setTokenSeparator(tokenSeparator);
|
streams.shingle.setTokenSeparator(tokenSeparator);
|
||||||
streams.shingle.setOutputUnigrams(outputUnigrams);
|
streams.shingle.setOutputUnigrams(outputUnigrams);
|
||||||
|
streams.shingle.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||||
return streams.shingle;
|
return streams.shingle;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -102,6 +102,11 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
*/
|
*/
|
||||||
private boolean outputUnigrams = true;
|
private boolean outputUnigrams = true;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* By default, we don't override behavior of outputUnigrams.
|
||||||
|
*/
|
||||||
|
private boolean outputUnigramsIfNoShingles = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* maximum shingle size (number of tokens)
|
* maximum shingle size (number of tokens)
|
||||||
*/
|
*/
|
||||||
|
@ -136,6 +141,11 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
* position.
|
* position.
|
||||||
*/
|
*/
|
||||||
private boolean isOutputHere = false;
|
private boolean isOutputHere = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* true if no shingles have been output yet (for outputUnigramsIfNoShingles).
|
||||||
|
*/
|
||||||
|
boolean noShingleOutput = true;
|
||||||
|
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
|
@ -211,6 +221,20 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
gramSize = new CircularSequence();
|
gramSize = new CircularSequence();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>Shall we override the behavior of outputUnigrams==false for those
|
||||||
|
* times when no shingles are available (because there are fewer than
|
||||||
|
* minShingleSize tokens in the input stream)? (default: false.)
|
||||||
|
* <p>Note that if outputUnigrams==true, then unigrams are always output,
|
||||||
|
* regardless of whether any shingles are available.
|
||||||
|
*
|
||||||
|
* @param outputUnigramsIfNoShingles Whether or not to output the input
|
||||||
|
* tokens as unigrams when no shingles are available.
|
||||||
|
*/
|
||||||
|
public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
|
||||||
|
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the max shingle size (default: 2)
|
* Set the max shingle size (default: 2)
|
||||||
*
|
*
|
||||||
|
@ -292,6 +316,7 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
termAtt.setEmpty().append(gramBuilder);
|
termAtt.setEmpty().append(gramBuilder);
|
||||||
if (gramSize.getValue() > 1) {
|
if (gramSize.getValue() > 1) {
|
||||||
typeAtt.setType(tokenType);
|
typeAtt.setType(tokenType);
|
||||||
|
noShingleOutput = false;
|
||||||
}
|
}
|
||||||
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
|
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
|
||||||
isOutputHere = true;
|
isOutputHere = true;
|
||||||
|
@ -395,6 +420,10 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (outputUnigramsIfNoShingles && noShingleOutput
|
||||||
|
&& gramSize.minValue > 1 && inputWindow.size() < minShingleSize) {
|
||||||
|
gramSize.minValue = 1;
|
||||||
|
}
|
||||||
gramSize.reset();
|
gramSize.reset();
|
||||||
isOutputHere = false;
|
isOutputHere = false;
|
||||||
}
|
}
|
||||||
|
@ -406,6 +435,11 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
inputWindow.clear();
|
inputWindow.clear();
|
||||||
numFillerTokensToInsert = 0;
|
numFillerTokensToInsert = 0;
|
||||||
isOutputHere = false;
|
isOutputHere = false;
|
||||||
|
noShingleOutput = true;
|
||||||
|
if (outputUnigramsIfNoShingles && ! outputUnigrams) {
|
||||||
|
// Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
|
||||||
|
gramSize.minValue = minShingleSize;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -359,4 +359,16 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
||||||
new int[] { 13, 18, 27 },
|
new int[] { 13, 18, 27 },
|
||||||
new int[] { 1, 1, 1 });
|
new int[] { 1, 1, 1 });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Analyzer-level check: with outputUnigrams disabled and outputUnigramsIfNoShingles
// enabled, the one-token input "please" is expected to yield the single term "please".
// The three int arrays are presumably start offsets, end offsets and position
// increments -- confirm against assertAnalyzesToReuse.
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
|
||||||
|
ShingleAnalyzerWrapper analyzer
|
||||||
|
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
|
||||||
|
analyzer.setOutputUnigrams(false);
|
||||||
|
analyzer.setOutputUnigramsIfNoShingles(true);
|
||||||
|
assertAnalyzesToReuse(analyzer, "please",
|
||||||
|
new String[] { "please" },
|
||||||
|
new int[] { 0 },
|
||||||
|
new int[] { 6 },
|
||||||
|
new int[] { 1 });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -73,6 +73,14 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
||||||
createToken("shingles", 33, 39),
|
createToken("shingles", 33, 39),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] {
|
||||||
|
1, 1, 1, 1, 1, 1
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final String[] UNIGRAM_ONLY_TYPES = new String[] {
|
||||||
|
"word", "word", "word", "word", "word", "word"
|
||||||
|
};
|
||||||
|
|
||||||
public static Token[] testTokenWithHoles;
|
public static Token[] testTokenWithHoles;
|
||||||
|
|
||||||
public static final Token[] BI_GRAM_TOKENS = new Token[] {
|
public static final Token[] BI_GRAM_TOKENS = new Token[] {
|
||||||
|
@ -1018,15 +1026,44 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
||||||
new int[]{1,0,1,0,1,0,1}
|
new int[]{1,0,1,0,1,0,1}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Arguments: minSize = 2, maxSize = 2, outputUnigrams = false,
// outputUnigramsIfNoShingles = true; the expected output is SINGLE_TOKEN.
public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException {
|
||||||
|
// Single token input with outputUnigrams==false is the primary case where
|
||||||
|
// enabling this option should alter program behavior.
|
||||||
|
this.shingleFilterTest(2, 2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
|
||||||
|
SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
|
||||||
|
false, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Arguments: minSize = 2, maxSize = 2, outputUnigrams = true,
// outputUnigramsIfNoShingles = true; the expected output is the plain bigram
// expectation (BI_GRAM_TOKENS).
public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException {
|
||||||
|
// Here we expect the same result as with testBiGramFilter().
|
||||||
|
this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS,
|
||||||
|
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
|
||||||
|
true, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Arguments: minSize = 2, maxSize = 2, outputUnigrams = false,
// outputUnigramsIfNoShingles = true; the expected output is the unigram-less
// bigram expectation (BI_GRAM_TOKENS_WITHOUT_UNIGRAMS).
public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException {
|
||||||
|
// Here we expect the same result as with testBiGramFilterWithoutUnigrams().
|
||||||
|
this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
|
||||||
|
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
|
||||||
|
false, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Arguments: minSize = 7, maxSize = 7, outputUnigrams = false,
// outputUnigramsIfNoShingles = true; the expected output is the input itself
// (TEST_TOKEN passed as both tokensToShingle and tokensToCompare).
public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException {
|
||||||
|
// Test when the minimum shingle size is greater than the number of input tokens
|
||||||
|
this.shingleFilterTest(7, 7, TEST_TOKEN, TEST_TOKEN,
|
||||||
|
UNIGRAM_ONLY_POSITION_INCREMENTS, UNIGRAM_ONLY_TYPES,
|
||||||
|
false, true);
|
||||||
|
}
|
||||||
|
|
||||||
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
|
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
|
||||||
int[] positionIncrements, String[] types,
|
int[] positionIncrements, String[] types,
|
||||||
boolean outputUnigrams)
|
boolean outputUnigrams)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
|
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
|
||||||
shingleFilterTestCommon
|
filter.setOutputUnigrams(outputUnigrams);
|
||||||
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
|
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
|
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
|
||||||
|
@ -1035,8 +1072,20 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
||||||
throws IOException {
|
throws IOException {
|
||||||
ShingleFilter filter
|
ShingleFilter filter
|
||||||
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
||||||
shingleFilterTestCommon
|
filter.setOutputUnigrams(outputUnigrams);
|
||||||
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
|
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
|
||||||
|
Token[] tokensToCompare, int[] positionIncrements,
|
||||||
|
String[] types, boolean outputUnigrams,
|
||||||
|
boolean outputUnigramsIfNoShingles)
|
||||||
|
throws IOException {
|
||||||
|
ShingleFilter filter
|
||||||
|
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
||||||
|
filter.setOutputUnigrams(outputUnigrams);
|
||||||
|
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||||
|
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
|
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
|
||||||
|
@ -1046,18 +1095,15 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
||||||
ShingleFilter filter
|
ShingleFilter filter
|
||||||
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
||||||
filter.setTokenSeparator(tokenSeparator);
|
filter.setTokenSeparator(tokenSeparator);
|
||||||
shingleFilterTestCommon
|
filter.setOutputUnigrams(outputUnigrams);
|
||||||
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
|
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void shingleFilterTestCommon(ShingleFilter filter,
|
protected void shingleFilterTestCommon(ShingleFilter filter,
|
||||||
Token[] tokensToCompare,
|
Token[] tokensToCompare,
|
||||||
int[] positionIncrements,
|
int[] positionIncrements,
|
||||||
String[] types, boolean outputUnigrams)
|
String[] types)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
filter.setOutputUnigrams(outputUnigrams);
|
|
||||||
|
|
||||||
String text[] = new String[tokensToCompare.length];
|
String text[] = new String[tokensToCompare.length];
|
||||||
int startOffsets[] = new int[tokensToCompare.length];
|
int startOffsets[] = new int[tokensToCompare.length];
|
||||||
int endOffsets[] = new int[tokensToCompare.length];
|
int endOffsets[] = new int[tokensToCompare.length];
|
||||||
|
|
Loading…
Reference in New Issue