diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java index 77e88627773..2267ce2007c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java @@ -480,7 +480,7 @@ public final class SynonymFilter extends TokenFilter { // endOffset (ie, endOffset of the last input // token it matched): endOffset = matchEndOffset; - posLen = matchInputLength; + posLen = keepOrig ? matchInputLength : 1; } else { // This rule has more than one output token; we // can't pick any particular endOffset for this diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java index 5379cab256f..7addbe981ae 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java @@ -40,7 +40,6 @@ import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.tokenattributes.*; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util._TestUtil; -import org.apache.lucene.util.LuceneTestCase.Slow; public class TestSynonymMapFilter extends BaseTokenStreamTestCase { @@ -53,6 +52,9 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { private OffsetAttribute offsetAtt; private void add(String input, String output, boolean keepOrig) { + if (VERBOSE) { + System.out.println(" add input=" + input + " output=" + output + " keepOrig=" + keepOrig); + } b.add(new CharsRef(input.replaceAll(" +", "\u0000")), new CharsRef(output.replaceAll(" +", "\u0000")), keepOrig); @@ -137,6 +139,56 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { assertEquals(expectedUpto, expected.length); } + public void testDontKeepOrig() throws Exception { + b = new SynonymMap.Builder(true); + add("a b", "foo", false); + + final SynonymMap map = b.build(); + + final Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false)); + } + }; + + assertAnalyzesTo(analyzer, "a b c", + new String[] {"foo", "c"}, + new int[] {0, 4}, + new int[] {3, 5}, + null, + new int[] {1, 1}, + new int[] {1, 1}, + true); + checkAnalysisConsistency(random(), analyzer, false, "a b c"); + } + + public void testDoKeepOrig() throws Exception { + b = new SynonymMap.Builder(true); + add("a b", "foo", true); + + final SynonymMap map = b.build(); + + final Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false)); + } + }; + + assertAnalyzesTo(analyzer, "a b c", + new String[] {"a", "foo", "b", "c"}, + new int[] {0, 0, 2, 4}, + new int[] {1, 3, 3, 5}, + null, + new int[] {1, 0, 1, 1}, + new int[] {1, 2, 1, 1}, + true); + checkAnalysisConsistency(random(), analyzer, false, "a b c"); + } + public void testBasic() throws Exception { b = new SynonymMap.Builder(true); add("a", "foo", true); @@ -284,7 +336,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { if (synOutputs.length == 1) { // Add full endOffset endOffset = (inputIDX*2) + syn.in.length(); - posLen = (1+syn.in.length())/2; + posLen = syn.keepOrig ? (1+syn.in.length())/2 : 1; } else { // Add endOffset matching input token's endOffset = (matchIDX*2) + 1; @@ -540,6 +592,9 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { for (int i = 0; i < numIters; i++) { b = new SynonymMap.Builder(random.nextBoolean()); final int numEntries = atLeast(10); + if (VERBOSE) { + System.out.println("TEST: iter=" + i + " numEntries=" + numEntries); + } for (int j = 0; j < numEntries; j++) { add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean()); }