From 7eab19aff7ea89e057252ec695b7036dc9ba631e Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 16 Aug 2011 16:01:05 +0000 Subject: [PATCH] LUCENE-3375: fix synonyms bug where keepOrig=false would discard unmatched inputs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1158342 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 4 +- .../analysis/synonym/SynonymFilter.java | 16 +- .../synonym/TestSynonymMapFilter.java | 284 +++++++++++++++++- 3 files changed, 293 insertions(+), 11 deletions(-) diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index c56159af27d..f6db582cd0d 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -78,9 +78,9 @@ New Features documents must be indexed as a document block, using IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless) - * LUCENE-3233: Added SynonymFilter for applying multi-word synonyms + * LUCENE-3233, LUCENE-3375: Added SynonymFilter for applying multi-word synonyms during indexing or querying (with parsers for wordnet and solr formats). - Removed contrib/wordnet. (Robert Muir, Mike McCandless) + Removed contrib/wordnet. (Simon Rosenthal, Robert Muir, Mike McCandless) API Changes diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java index 64827c821c4..984298458d1 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java @@ -132,6 +132,7 @@ public final class SynonymFilter extends TokenFilter { final CharsRef term = new CharsRef(); AttributeSource.State state; boolean keepOrig; + boolean matched; boolean consumed = true; int startOffset; int endOffset; @@ -140,6 +141,7 @@ public final class SynonymFilter extends TokenFilter { state = null; consumed = true; keepOrig = false; + matched = false; } }; @@ -388,7 +390,7 @@ public final class SynonymFilter extends TokenFilter { if (matchOutput != null) { //System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput); inputSkipCount = matchInputLength; - addOutput(matchOutput); + addOutput(matchOutput, matchInputLength); } else if (nextRead != nextWrite) { // Even though we had no match here, we set to 1 // because we need to skip current input token before @@ -402,7 +404,7 @@ public final class SynonymFilter extends TokenFilter { } // Interleaves all output tokens onto the futureOutputs: - private void addOutput(BytesRef bytes) { + private void addOutput(BytesRef bytes, int matchInputLength) { bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); final int code = bytesReader.readVInt(); @@ -426,13 +428,19 @@ public final class SynonymFilter extends TokenFilter { futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen); //System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto); lastStart = 1+chIDX; - futureInputs[outputUpto].keepOrig |= keepOrig; //System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig); outputUpto = rollIncr(outputUpto); assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite; } } } + + int upto = nextRead; + for(int idx=0;idx 0); // no captureStates when no syns matched @@ -181,6 +184,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { assertTrue(doc.length() % 2 == 0); final int numInputs = doc.length()/2; boolean[] keepOrigs = new boolean[numInputs]; + boolean[] hasMatch = new boolean[numInputs]; Arrays.fill(keepOrigs, false); String[] outputs = new String[numInputs + maxOutputLength]; OneSyn[] matches = new OneSyn[numInputs]; @@ -223,6 +227,10 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { if (syn == null) { continue; } + for(int idx=0;idx<(1+syn.in.length())/2;idx++) { + hasMatch[inputIDX+idx] = true; + keepOrigs[inputIDX+idx] |= syn.keepOrig; + } for(String synOut : syn.out) { final String[] synOutputs = synOut.split(" "); assertEquals(synOutputs.length, (1+synOut.length())/2); @@ -234,9 +242,6 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { } else { outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++]; } - if (matchIDX < numInputs) { - keepOrigs[matchIDX] |= syn.keepOrig; - } } } } @@ -249,7 +254,8 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { if (inputIDX >= numInputs && outputs[inputIDX] == null) { break; } - if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) { + if (inputIDX < numInputs && (!hasMatch[inputIDX] || keepOrigs[inputIDX])) { + assertTrue(inputTokens[inputIDX].length() != 0); sb.append(inputTokens[inputIDX]); posHasOutput = true; } @@ -259,6 +265,8 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { sb.append('/'); } sb.append(outputs[inputIDX]); + } else if (!posHasOutput) { + continue; } if (inputIDX < limit-1) { sb.append(' '); @@ -390,4 +398,270 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER); } } + + // LUCENE-3375 + public void testVanishingTerms() throws Exception { + String testFile = + "aaa => aaaa1 aaaa2 aaaa3\n" + + "bbb => bbbb1 bbbb2\n"; + + SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random)); + parser.add(new StringReader(testFile)); + final SynonymMap map = parser.build(); + + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + // where did my pot go?! + assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold", + new String[] { "xyzzy", "bbbb1", "pot", "bbbb2", "of", "gold" }); + + // this one nukes 'pot' and 'of' + // xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold + assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold", + new String[] { "xyzzy", "aaaa1", "pot", "aaaa2", "of", "aaaa3", "gold" }); + } + + public void testBasic2() throws Exception { + b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig); + add("bbb", "bbbb1 bbbb2", keepOrig); + tokensIn = new MockTokenizer(new StringReader("a"), + MockTokenizer.WHITESPACE, + true); + tokensIn.reset(); + assertTrue(tokensIn.incrementToken()); + assertFalse(tokensIn.incrementToken()); + tokensIn.end(); + tokensIn.close(); + + tokensOut = new SynonymFilter(tokensIn, + b.build(), + true); + termAtt = tokensOut.addAttribute(CharTermAttribute.class); + posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class); + offsetAtt = tokensOut.addAttribute(OffsetAttribute.class); + + if (keepOrig) { + verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold"); + verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold"); + } else { + verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold"); + verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold"); + } + } + + public void testMatching() throws Exception { + b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + add("a b", "ab", keepOrig); + add("a c", "ac", keepOrig); + add("a", "aa", keepOrig); + add("b", "bb", keepOrig); + add("z x c v", "zxcv", keepOrig); + add("x c", "xc", keepOrig); + final SynonymMap map = b.build(); + Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + checkOneTerm(a, "$", "$"); + checkOneTerm(a, "a", "aa"); + checkOneTerm(a, "b", "bb"); + + assertAnalyzesTo(a, "a $", + new String[] { "aa", "$" }, + new int[] { 1, 1 }); + + assertAnalyzesTo(a, "$ a", + new String[] { "$", "aa" }, + new int[] { 1, 1 }); + + assertAnalyzesTo(a, "a a", + new String[] { "aa", "aa" }, + new int[] { 1, 1 }); + + assertAnalyzesTo(a, "z x c v", + new String[] { "zxcv" }, + new int[] { 1 }); + + assertAnalyzesTo(a, "z x c $", + new String[] { "z", "xc", "$" }, + new int[] { 1, 1, 1 }); + } + + public void testRepeatsOff() throws Exception { + b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + add("a b", "ab", keepOrig); + add("a b", "ab", keepOrig); + add("a b", "ab", keepOrig); + final SynonymMap map = b.build(); + Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + assertAnalyzesTo(a, "a b", + new String[] { "ab" }, + new int[] { 1 }); + } + + public void testRepeatsOn() throws Exception { + b = new SynonymMap.Builder(false); + final boolean keepOrig = false; + add("a b", "ab", keepOrig); + add("a b", "ab", keepOrig); + add("a b", "ab", keepOrig); + final SynonymMap map = b.build(); + Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + assertAnalyzesTo(a, "a b", + new String[] { "ab", "ab", "ab" }, + new int[] { 1, 0, 0 }); + } + + public void testRecursion() throws Exception { + b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + add("zoo", "zoo", keepOrig); + final SynonymMap map = b.build(); + Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + assertAnalyzesTo(a, "zoo zoo $ zoo", + new String[] { "zoo", "zoo", "$", "zoo" }, + new int[] { 1, 1, 1, 1 }); + } + + public void testRecursion2() throws Exception { + b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + add("zoo", "zoo", keepOrig); + add("zoo", "zoo zoo", keepOrig); + final SynonymMap map = b.build(); + Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + // verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo"); + assertAnalyzesTo(a, "zoo zoo $ zoo", + new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo", "zoo" }, + new int[] { 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 }); + } + + public void testIncludeOrig() throws Exception { + b = new SynonymMap.Builder(true); + final boolean keepOrig = true; + add("a b", "ab", keepOrig); + add("a c", "ac", keepOrig); + add("a", "aa", keepOrig); + add("b", "bb", keepOrig); + add("z x c v", "zxcv", keepOrig); + add("x c", "xc", keepOrig); + final SynonymMap map = b.build(); + Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + assertAnalyzesTo(a, "$", + new String[] { "$" }, + new int[] { 1 }); + assertAnalyzesTo(a, "a", + new String[] { "a", "aa" }, + new int[] { 1, 0 }); + assertAnalyzesTo(a, "a", + new String[] { "a", "aa" }, + new int[] { 1, 0 }); + assertAnalyzesTo(a, "$ a", + new String[] { "$", "a", "aa" }, + new int[] { 1, 1, 0 }); + assertAnalyzesTo(a, "a $", + new String[] { "a", "aa", "$" }, + new int[] { 1, 0, 1 }); + assertAnalyzesTo(a, "$ a !", + new String[] { "$", "a", "aa", "!" }, + new int[] { 1, 1, 0, 1 }); + assertAnalyzesTo(a, "a a", + new String[] { "a", "aa", "a", "aa" }, + new int[] { 1, 0, 1, 0 }); + assertAnalyzesTo(a, "b", + new String[] { "b", "bb" }, + new int[] { 1, 0 }); + assertAnalyzesTo(a, "z x c v", + new String[] { "z", "zxcv", "x", "c", "v" }, + new int[] { 1, 0, 1, 1, 1 }); + assertAnalyzesTo(a, "z x c $", + new String[] { "z", "x", "xc", "c", "$" }, + new int[] { 1, 1, 0, 1, 1 }); + } + + public void testRecursion3() throws Exception { + b = new SynonymMap.Builder(true); + final boolean keepOrig = true; + add("zoo zoo", "zoo", keepOrig); + final SynonymMap map = b.build(); + Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + assertAnalyzesTo(a, "zoo zoo $ zoo", + new String[] { "zoo", "zoo", "zoo", "$", "zoo" }, + new int[] { 1, 0, 1, 1, 1 }); + } + + public void testRecursion4() throws Exception { + b = new SynonymMap.Builder(true); + final boolean keepOrig = true; + add("zoo zoo", "zoo", keepOrig); + add("zoo", "zoo zoo", keepOrig); + final SynonymMap map = b.build(); + Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + assertAnalyzesTo(a, "zoo zoo $ zoo", + new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" }, + new int[] { 1, 0, 1, 1, 1, 0, 1 }); + } }