mirror of https://github.com/apache/lucene.git
LUCENE-3375: fix synonyms bug where keepOrig=false would discard unmatched inputs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1158342 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e227a64b9d
commit
7eab19aff7
|
@ -78,9 +78,9 @@ New Features
|
||||||
documents must be indexed as a document block, using
|
documents must be indexed as a document block, using
|
||||||
IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless)
|
IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless)
|
||||||
|
|
||||||
* LUCENE-3233: Added SynonymFilter for applying multi-word synonyms
|
* LUCENE-3233, LUCENE-3375: Added SynonymFilter for applying multi-word synonyms
|
||||||
during indexing or querying (with parsers for wordnet and solr formats).
|
during indexing or querying (with parsers for wordnet and solr formats).
|
||||||
Removed contrib/wordnet. (Robert Muir, Mike McCandless)
|
Removed contrib/wordnet. (Simon Rosenthal, Robert Muir, Mike McCandless)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
|
|
|
@ -132,6 +132,7 @@ public final class SynonymFilter extends TokenFilter {
|
||||||
final CharsRef term = new CharsRef();
|
final CharsRef term = new CharsRef();
|
||||||
AttributeSource.State state;
|
AttributeSource.State state;
|
||||||
boolean keepOrig;
|
boolean keepOrig;
|
||||||
|
boolean matched;
|
||||||
boolean consumed = true;
|
boolean consumed = true;
|
||||||
int startOffset;
|
int startOffset;
|
||||||
int endOffset;
|
int endOffset;
|
||||||
|
@ -140,6 +141,7 @@ public final class SynonymFilter extends TokenFilter {
|
||||||
state = null;
|
state = null;
|
||||||
consumed = true;
|
consumed = true;
|
||||||
keepOrig = false;
|
keepOrig = false;
|
||||||
|
matched = false;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -388,7 +390,7 @@ public final class SynonymFilter extends TokenFilter {
|
||||||
if (matchOutput != null) {
|
if (matchOutput != null) {
|
||||||
//System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput);
|
//System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput);
|
||||||
inputSkipCount = matchInputLength;
|
inputSkipCount = matchInputLength;
|
||||||
addOutput(matchOutput);
|
addOutput(matchOutput, matchInputLength);
|
||||||
} else if (nextRead != nextWrite) {
|
} else if (nextRead != nextWrite) {
|
||||||
// Even though we had no match here, we set to 1
|
// Even though we had no match here, we set to 1
|
||||||
// because we need to skip current input token before
|
// because we need to skip current input token before
|
||||||
|
@ -402,7 +404,7 @@ public final class SynonymFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Interleaves all output tokens onto the futureOutputs:
|
// Interleaves all output tokens onto the futureOutputs:
|
||||||
private void addOutput(BytesRef bytes) {
|
private void addOutput(BytesRef bytes, int matchInputLength) {
|
||||||
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
|
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
|
||||||
|
|
||||||
final int code = bytesReader.readVInt();
|
final int code = bytesReader.readVInt();
|
||||||
|
@ -426,13 +428,19 @@ public final class SynonymFilter extends TokenFilter {
|
||||||
futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
|
futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
|
||||||
//System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
|
//System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
|
||||||
lastStart = 1+chIDX;
|
lastStart = 1+chIDX;
|
||||||
futureInputs[outputUpto].keepOrig |= keepOrig;
|
|
||||||
//System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig);
|
//System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig);
|
||||||
outputUpto = rollIncr(outputUpto);
|
outputUpto = rollIncr(outputUpto);
|
||||||
assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
|
assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int upto = nextRead;
|
||||||
|
for(int idx=0;idx<matchInputLength;idx++) {
|
||||||
|
futureInputs[upto].keepOrig |= keepOrig;
|
||||||
|
futureInputs[upto].matched = true;
|
||||||
|
upto = rollIncr(upto);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ++ mod rollBufferSize
|
// ++ mod rollBufferSize
|
||||||
|
@ -471,7 +479,7 @@ public final class SynonymFilter extends TokenFilter {
|
||||||
|
|
||||||
//System.out.println(" cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);
|
//System.out.println(" cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);
|
||||||
|
|
||||||
if (!input.consumed && (input.keepOrig || outputs.count == 0)) {
|
if (!input.consumed && (input.keepOrig || !input.matched)) {
|
||||||
if (input.state != null) {
|
if (input.state != null) {
|
||||||
// Return a previously saved token (because we
|
// Return a previously saved token (because we
|
||||||
// had to lookahead):
|
// had to lookahead):
|
||||||
|
|
|
@ -29,6 +29,7 @@ import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.*;
|
import org.apache.lucene.analysis.tokenattributes.*;
|
||||||
|
@ -59,6 +60,8 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// todo: we should probably refactor this guy to use/take analyzer,
|
||||||
|
// the tests are a little messy
|
||||||
private void verify(String input, String output) throws Exception {
|
private void verify(String input, String output) throws Exception {
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println("TEST: verify input=" + input + " expectedOutput=" + output);
|
System.out.println("TEST: verify input=" + input + " expectedOutput=" + output);
|
||||||
|
@ -148,7 +151,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
// mixed keepOrig true/false:
|
// mixed keepOrig true/false:
|
||||||
verify("a m c e x", "a/foo dog barks loudly x");
|
verify("a m c e x", "a/foo dog barks loudly x");
|
||||||
verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x");
|
verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
|
||||||
assertTrue(tokensOut.getCaptureCount() > 0);
|
assertTrue(tokensOut.getCaptureCount() > 0);
|
||||||
|
|
||||||
// no captureStates when no syns matched
|
// no captureStates when no syns matched
|
||||||
|
@ -181,6 +184,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
|
||||||
assertTrue(doc.length() % 2 == 0);
|
assertTrue(doc.length() % 2 == 0);
|
||||||
final int numInputs = doc.length()/2;
|
final int numInputs = doc.length()/2;
|
||||||
boolean[] keepOrigs = new boolean[numInputs];
|
boolean[] keepOrigs = new boolean[numInputs];
|
||||||
|
boolean[] hasMatch = new boolean[numInputs];
|
||||||
Arrays.fill(keepOrigs, false);
|
Arrays.fill(keepOrigs, false);
|
||||||
String[] outputs = new String[numInputs + maxOutputLength];
|
String[] outputs = new String[numInputs + maxOutputLength];
|
||||||
OneSyn[] matches = new OneSyn[numInputs];
|
OneSyn[] matches = new OneSyn[numInputs];
|
||||||
|
@ -223,6 +227,10 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
|
||||||
if (syn == null) {
|
if (syn == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
for(int idx=0;idx<(1+syn.in.length())/2;idx++) {
|
||||||
|
hasMatch[inputIDX+idx] = true;
|
||||||
|
keepOrigs[inputIDX+idx] |= syn.keepOrig;
|
||||||
|
}
|
||||||
for(String synOut : syn.out) {
|
for(String synOut : syn.out) {
|
||||||
final String[] synOutputs = synOut.split(" ");
|
final String[] synOutputs = synOut.split(" ");
|
||||||
assertEquals(synOutputs.length, (1+synOut.length())/2);
|
assertEquals(synOutputs.length, (1+synOut.length())/2);
|
||||||
|
@ -234,9 +242,6 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
|
||||||
} else {
|
} else {
|
||||||
outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
|
outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
|
||||||
}
|
}
|
||||||
if (matchIDX < numInputs) {
|
|
||||||
keepOrigs[matchIDX] |= syn.keepOrig;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -249,7 +254,8 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
|
||||||
if (inputIDX >= numInputs && outputs[inputIDX] == null) {
|
if (inputIDX >= numInputs && outputs[inputIDX] == null) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) {
|
if (inputIDX < numInputs && (!hasMatch[inputIDX] || keepOrigs[inputIDX])) {
|
||||||
|
assertTrue(inputTokens[inputIDX].length() != 0);
|
||||||
sb.append(inputTokens[inputIDX]);
|
sb.append(inputTokens[inputIDX]);
|
||||||
posHasOutput = true;
|
posHasOutput = true;
|
||||||
}
|
}
|
||||||
|
@ -259,6 +265,8 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
|
||||||
sb.append('/');
|
sb.append('/');
|
||||||
}
|
}
|
||||||
sb.append(outputs[inputIDX]);
|
sb.append(outputs[inputIDX]);
|
||||||
|
} else if (!posHasOutput) {
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
if (inputIDX < limit-1) {
|
if (inputIDX < limit-1) {
|
||||||
sb.append(' ');
|
sb.append(' ');
|
||||||
|
@ -390,4 +398,270 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
|
||||||
checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
|
checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LUCENE-3375
|
||||||
|
public void testVanishingTerms() throws Exception {
|
||||||
|
String testFile =
|
||||||
|
"aaa => aaaa1 aaaa2 aaaa3\n" +
|
||||||
|
"bbb => bbbb1 bbbb2\n";
|
||||||
|
|
||||||
|
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
|
||||||
|
parser.add(new StringReader(testFile));
|
||||||
|
final SynonymMap map = parser.build();
|
||||||
|
|
||||||
|
Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
|
||||||
|
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// where did my pot go?!
|
||||||
|
assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold",
|
||||||
|
new String[] { "xyzzy", "bbbb1", "pot", "bbbb2", "of", "gold" });
|
||||||
|
|
||||||
|
// this one nukes 'pot' and 'of'
|
||||||
|
// xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold
|
||||||
|
assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold",
|
||||||
|
new String[] { "xyzzy", "aaaa1", "pot", "aaaa2", "of", "aaaa3", "gold" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBasic2() throws Exception {
|
||||||
|
b = new SynonymMap.Builder(true);
|
||||||
|
final boolean keepOrig = false;
|
||||||
|
add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
|
||||||
|
add("bbb", "bbbb1 bbbb2", keepOrig);
|
||||||
|
tokensIn = new MockTokenizer(new StringReader("a"),
|
||||||
|
MockTokenizer.WHITESPACE,
|
||||||
|
true);
|
||||||
|
tokensIn.reset();
|
||||||
|
assertTrue(tokensIn.incrementToken());
|
||||||
|
assertFalse(tokensIn.incrementToken());
|
||||||
|
tokensIn.end();
|
||||||
|
tokensIn.close();
|
||||||
|
|
||||||
|
tokensOut = new SynonymFilter(tokensIn,
|
||||||
|
b.build(),
|
||||||
|
true);
|
||||||
|
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
|
||||||
|
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
|
if (keepOrig) {
|
||||||
|
verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
|
||||||
|
verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
|
||||||
|
} else {
|
||||||
|
verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
|
||||||
|
verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMatching() throws Exception {
|
||||||
|
b = new SynonymMap.Builder(true);
|
||||||
|
final boolean keepOrig = false;
|
||||||
|
add("a b", "ab", keepOrig);
|
||||||
|
add("a c", "ac", keepOrig);
|
||||||
|
add("a", "aa", keepOrig);
|
||||||
|
add("b", "bb", keepOrig);
|
||||||
|
add("z x c v", "zxcv", keepOrig);
|
||||||
|
add("x c", "xc", keepOrig);
|
||||||
|
final SynonymMap map = b.build();
|
||||||
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
checkOneTerm(a, "$", "$");
|
||||||
|
checkOneTerm(a, "a", "aa");
|
||||||
|
checkOneTerm(a, "b", "bb");
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "a $",
|
||||||
|
new String[] { "aa", "$" },
|
||||||
|
new int[] { 1, 1 });
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "$ a",
|
||||||
|
new String[] { "$", "aa" },
|
||||||
|
new int[] { 1, 1 });
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "a a",
|
||||||
|
new String[] { "aa", "aa" },
|
||||||
|
new int[] { 1, 1 });
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "z x c v",
|
||||||
|
new String[] { "zxcv" },
|
||||||
|
new int[] { 1 });
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "z x c $",
|
||||||
|
new String[] { "z", "xc", "$" },
|
||||||
|
new int[] { 1, 1, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRepeatsOff() throws Exception {
|
||||||
|
b = new SynonymMap.Builder(true);
|
||||||
|
final boolean keepOrig = false;
|
||||||
|
add("a b", "ab", keepOrig);
|
||||||
|
add("a b", "ab", keepOrig);
|
||||||
|
add("a b", "ab", keepOrig);
|
||||||
|
final SynonymMap map = b.build();
|
||||||
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "a b",
|
||||||
|
new String[] { "ab" },
|
||||||
|
new int[] { 1 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRepeatsOn() throws Exception {
|
||||||
|
b = new SynonymMap.Builder(false);
|
||||||
|
final boolean keepOrig = false;
|
||||||
|
add("a b", "ab", keepOrig);
|
||||||
|
add("a b", "ab", keepOrig);
|
||||||
|
add("a b", "ab", keepOrig);
|
||||||
|
final SynonymMap map = b.build();
|
||||||
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "a b",
|
||||||
|
new String[] { "ab", "ab", "ab" },
|
||||||
|
new int[] { 1, 0, 0 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRecursion() throws Exception {
|
||||||
|
b = new SynonymMap.Builder(true);
|
||||||
|
final boolean keepOrig = false;
|
||||||
|
add("zoo", "zoo", keepOrig);
|
||||||
|
final SynonymMap map = b.build();
|
||||||
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "zoo zoo $ zoo",
|
||||||
|
new String[] { "zoo", "zoo", "$", "zoo" },
|
||||||
|
new int[] { 1, 1, 1, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRecursion2() throws Exception {
|
||||||
|
b = new SynonymMap.Builder(true);
|
||||||
|
final boolean keepOrig = false;
|
||||||
|
add("zoo", "zoo", keepOrig);
|
||||||
|
add("zoo", "zoo zoo", keepOrig);
|
||||||
|
final SynonymMap map = b.build();
|
||||||
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo");
|
||||||
|
assertAnalyzesTo(a, "zoo zoo $ zoo",
|
||||||
|
new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo", "zoo" },
|
||||||
|
new int[] { 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testIncludeOrig() throws Exception {
|
||||||
|
b = new SynonymMap.Builder(true);
|
||||||
|
final boolean keepOrig = true;
|
||||||
|
add("a b", "ab", keepOrig);
|
||||||
|
add("a c", "ac", keepOrig);
|
||||||
|
add("a", "aa", keepOrig);
|
||||||
|
add("b", "bb", keepOrig);
|
||||||
|
add("z x c v", "zxcv", keepOrig);
|
||||||
|
add("x c", "xc", keepOrig);
|
||||||
|
final SynonymMap map = b.build();
|
||||||
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "$",
|
||||||
|
new String[] { "$" },
|
||||||
|
new int[] { 1 });
|
||||||
|
assertAnalyzesTo(a, "a",
|
||||||
|
new String[] { "a", "aa" },
|
||||||
|
new int[] { 1, 0 });
|
||||||
|
assertAnalyzesTo(a, "a",
|
||||||
|
new String[] { "a", "aa" },
|
||||||
|
new int[] { 1, 0 });
|
||||||
|
assertAnalyzesTo(a, "$ a",
|
||||||
|
new String[] { "$", "a", "aa" },
|
||||||
|
new int[] { 1, 1, 0 });
|
||||||
|
assertAnalyzesTo(a, "a $",
|
||||||
|
new String[] { "a", "aa", "$" },
|
||||||
|
new int[] { 1, 0, 1 });
|
||||||
|
assertAnalyzesTo(a, "$ a !",
|
||||||
|
new String[] { "$", "a", "aa", "!" },
|
||||||
|
new int[] { 1, 1, 0, 1 });
|
||||||
|
assertAnalyzesTo(a, "a a",
|
||||||
|
new String[] { "a", "aa", "a", "aa" },
|
||||||
|
new int[] { 1, 0, 1, 0 });
|
||||||
|
assertAnalyzesTo(a, "b",
|
||||||
|
new String[] { "b", "bb" },
|
||||||
|
new int[] { 1, 0 });
|
||||||
|
assertAnalyzesTo(a, "z x c v",
|
||||||
|
new String[] { "z", "zxcv", "x", "c", "v" },
|
||||||
|
new int[] { 1, 0, 1, 1, 1 });
|
||||||
|
assertAnalyzesTo(a, "z x c $",
|
||||||
|
new String[] { "z", "x", "xc", "c", "$" },
|
||||||
|
new int[] { 1, 1, 0, 1, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRecursion3() throws Exception {
|
||||||
|
b = new SynonymMap.Builder(true);
|
||||||
|
final boolean keepOrig = true;
|
||||||
|
add("zoo zoo", "zoo", keepOrig);
|
||||||
|
final SynonymMap map = b.build();
|
||||||
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "zoo zoo $ zoo",
|
||||||
|
new String[] { "zoo", "zoo", "zoo", "$", "zoo" },
|
||||||
|
new int[] { 1, 0, 1, 1, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRecursion4() throws Exception {
|
||||||
|
b = new SynonymMap.Builder(true);
|
||||||
|
final boolean keepOrig = true;
|
||||||
|
add("zoo zoo", "zoo", keepOrig);
|
||||||
|
add("zoo", "zoo zoo", keepOrig);
|
||||||
|
final SynonymMap map = b.build();
|
||||||
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "zoo zoo $ zoo",
|
||||||
|
new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
|
||||||
|
new int[] { 1, 0, 1, 1, 1, 0, 1 });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue