mirror of https://github.com/apache/lucene.git
LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles can be generated.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1006187 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2e6787a850
commit
f9e4f551e2
|
@ -15,6 +15,9 @@ API Changes
|
|||
RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
|
||||
behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
|
||||
|
||||
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
|
||||
can be generated. (Chris Harris via Steven Rowe)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-2413: Consolidated Solr analysis components into common.
|
||||
|
|
|
@ -38,6 +38,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
|||
private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
|
||||
private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
|
||||
private boolean outputUnigrams = true;
|
||||
private boolean outputUnigramsIfNoShingles = false;
|
||||
|
||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
|
||||
super();
|
||||
|
@ -147,6 +148,24 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
|||
public void setOutputUnigrams(boolean outputUnigrams) {
|
||||
this.outputUnigrams = outputUnigrams;
|
||||
}
|
||||
|
||||
public boolean isOutputUnigramsIfNoShingles() {
|
||||
return outputUnigramsIfNoShingles;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Shall we override the behavior of outputUnigrams==false for those
|
||||
* times when no shingles are available (because there are fewer than
|
||||
* minShingleSize tokens in the input stream)? (default: false.)
|
||||
* <p>Note that if outputUnigrams==true, then unigrams are always output,
|
||||
* regardless of whether any shingles are available.
|
||||
*
|
||||
* @param outputUnigramsIfNoShingles Whether or not to output a single
|
||||
* unigram when no shingles are available.
|
||||
*/
|
||||
public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
|
||||
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
|
@ -161,6 +180,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
|||
filter.setMaxShingleSize(maxShingleSize);
|
||||
filter.setTokenSeparator(tokenSeparator);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||
return filter;
|
||||
}
|
||||
|
||||
|
@ -192,6 +212,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
|||
streams.shingle.setMinShingleSize(minShingleSize);
|
||||
streams.shingle.setTokenSeparator(tokenSeparator);
|
||||
streams.shingle.setOutputUnigrams(outputUnigrams);
|
||||
streams.shingle.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||
return streams.shingle;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -102,6 +102,11 @@ public final class ShingleFilter extends TokenFilter {
|
|||
*/
|
||||
private boolean outputUnigrams = true;
|
||||
|
||||
/**
|
||||
* By default, we don't override behavior of outputUnigrams.
|
||||
*/
|
||||
private boolean outputUnigramsIfNoShingles = false;
|
||||
|
||||
/**
|
||||
* maximum shingle size (number of tokens)
|
||||
*/
|
||||
|
@ -136,6 +141,11 @@ public final class ShingleFilter extends TokenFilter {
|
|||
* position.
|
||||
*/
|
||||
private boolean isOutputHere = false;
|
||||
|
||||
/**
|
||||
* true if no shingles have been output yet (for outputUnigramsIfNoShingles).
|
||||
*/
|
||||
boolean noShingleOutput = true;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
@ -211,6 +221,20 @@ public final class ShingleFilter extends TokenFilter {
|
|||
gramSize = new CircularSequence();
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Shall we override the behavior of outputUnigrams==false for those
|
||||
* times when no shingles are available (because there are fewer than
|
||||
* minShingleSize tokens in the input stream)? (default: false.)
|
||||
* <p>Note that if outputUnigrams==true, then unigrams are always output,
|
||||
* regardless of whether any shingles are available.
|
||||
*
|
||||
* @param outputUnigramsIfNoShingles Whether or not to output a single
|
||||
* unigram when no shingles are available.
|
||||
*/
|
||||
public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
|
||||
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the max shingle size (default: 2)
|
||||
*
|
||||
|
@ -292,6 +316,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
termAtt.setEmpty().append(gramBuilder);
|
||||
if (gramSize.getValue() > 1) {
|
||||
typeAtt.setType(tokenType);
|
||||
noShingleOutput = false;
|
||||
}
|
||||
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
|
||||
isOutputHere = true;
|
||||
|
@ -395,6 +420,10 @@ public final class ShingleFilter extends TokenFilter {
|
|||
}
|
||||
}
|
||||
}
|
||||
if (outputUnigramsIfNoShingles && noShingleOutput
|
||||
&& gramSize.minValue > 1 && inputWindow.size() < minShingleSize) {
|
||||
gramSize.minValue = 1;
|
||||
}
|
||||
gramSize.reset();
|
||||
isOutputHere = false;
|
||||
}
|
||||
|
@ -406,6 +435,11 @@ public final class ShingleFilter extends TokenFilter {
|
|||
inputWindow.clear();
|
||||
numFillerTokensToInsert = 0;
|
||||
isOutputHere = false;
|
||||
noShingleOutput = true;
|
||||
if (outputUnigramsIfNoShingles && ! outputUnigrams) {
|
||||
// Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
|
||||
gramSize.minValue = minShingleSize;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -359,4 +359,16 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 13, 18, 27 },
|
||||
new int[] { 1, 1, 1 });
|
||||
}
|
||||
|
||||
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer
|
||||
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
|
||||
analyzer.setOutputUnigrams(false);
|
||||
analyzer.setOutputUnigramsIfNoShingles(true);
|
||||
assertAnalyzesToReuse(analyzer, "please",
|
||||
new String[] { "please" },
|
||||
new int[] { 0 },
|
||||
new int[] { 6 },
|
||||
new int[] { 1 });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -73,6 +73,14 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
createToken("shingles", 33, 39),
|
||||
};
|
||||
|
||||
public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] {
|
||||
1, 1, 1, 1, 1, 1
|
||||
};
|
||||
|
||||
public static final String[] UNIGRAM_ONLY_TYPES = new String[] {
|
||||
"word", "word", "word", "word", "word", "word"
|
||||
};
|
||||
|
||||
public static Token[] testTokenWithHoles;
|
||||
|
||||
public static final Token[] BI_GRAM_TOKENS = new Token[] {
|
||||
|
@ -1018,15 +1026,44 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
new int[]{1,0,1,0,1,0,1}
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException {
|
||||
// Single token input with outputUnigrams==false is the primary case where
|
||||
// enabling this option should alter program behavior.
|
||||
this.shingleFilterTest(2, 2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
|
||||
SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
|
||||
false, true);
|
||||
}
|
||||
|
||||
public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException {
|
||||
// Here we expect the same result as with testBiGramFilter().
|
||||
this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS,
|
||||
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
|
||||
true, true);
|
||||
}
|
||||
|
||||
public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException {
|
||||
// Here we expect the same result as with testBiGramFilterWithoutUnigrams().
|
||||
this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
|
||||
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
|
||||
false, true);
|
||||
}
|
||||
|
||||
public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException {
|
||||
// Test when the minimum shingle size is greater than the number of input tokens
|
||||
this.shingleFilterTest(7, 7, TEST_TOKEN, TEST_TOKEN,
|
||||
UNIGRAM_ONLY_POSITION_INCREMENTS, UNIGRAM_ONLY_TYPES,
|
||||
false, true);
|
||||
}
|
||||
|
||||
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
|
||||
int[] positionIncrements, String[] types,
|
||||
boolean outputUnigrams)
|
||||
throws IOException {
|
||||
|
||||
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
|
||||
shingleFilterTestCommon
|
||||
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||
}
|
||||
|
||||
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
|
||||
|
@ -1035,8 +1072,20 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
throws IOException {
|
||||
ShingleFilter filter
|
||||
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
||||
shingleFilterTestCommon
|
||||
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||
}
|
||||
|
||||
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
|
||||
Token[] tokensToCompare, int[] positionIncrements,
|
||||
String[] types, boolean outputUnigrams,
|
||||
boolean outputUnigramsIfNoShingles)
|
||||
throws IOException {
|
||||
ShingleFilter filter
|
||||
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||
}
|
||||
|
||||
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
|
||||
|
@ -1046,18 +1095,15 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
ShingleFilter filter
|
||||
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
|
||||
filter.setTokenSeparator(tokenSeparator);
|
||||
shingleFilterTestCommon
|
||||
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
|
||||
}
|
||||
|
||||
protected void shingleFilterTestCommon(ShingleFilter filter,
|
||||
Token[] tokensToCompare,
|
||||
int[] positionIncrements,
|
||||
String[] types, boolean outputUnigrams)
|
||||
String[] types)
|
||||
throws IOException {
|
||||
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
|
||||
String text[] = new String[tokensToCompare.length];
|
||||
int startOffsets[] = new int[tokensToCompare.length];
|
||||
int endOffsets[] = new int[tokensToCompare.length];
|
||||
|
|
Loading…
Reference in New Issue