diff --git a/CHANGES.txt b/CHANGES.txt index e9a673ce84e..bcfcf918efb 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -509,6 +509,11 @@ Bug Fixes already been closed/destroyed; if it hasn't a, SEVERE error is logged first. (noble, hossman) +60. SOLR-1362: WordDelimiterFilter had inconsistent behavior when setting + the position increment of tokens following a token consisting of all + delimiters, and could additionally lose big position increments. + (Robert Muir, yonik + Other Changes ---------------------- 1. Upgraded to Lucene 2.4.0 (yonik) diff --git a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java index abb071ddadd..b15be279640 100644 --- a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java +++ b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java @@ -384,13 +384,15 @@ final class WordDelimiterFilter extends TokenFilter { int start=0; if (len ==0) continue; + int posInc = t.getPositionIncrement(); + origPosIncrement += posInc; + //skip protected tokens if (protWords != null && protWords.contains(termBuffer, 0, len)) { + t.setPositionIncrement(origPosIncrement); return t; } - origPosIncrement += t.getPositionIncrement(); - // Avoid calling charType more than once for each char (basically // avoid any backtracking). // makes code slightly more difficult, but faster. @@ -482,6 +484,7 @@ final class WordDelimiterFilter extends TokenFilter { if (start==0) { // the subword is the whole original token, so // return it unchanged. + t.setPositionIncrement(origPosIncrement); return t; } @@ -492,6 +495,7 @@ final class WordDelimiterFilter extends TokenFilter { // of the original token t.setTermBuffer(termBuffer, start, len-start); t.setStartOffset(t.startOffset() + start); + t.setPositionIncrement(origPosIncrement); return t; } @@ -524,6 +528,9 @@ final class WordDelimiterFilter extends TokenFilter { if (preserveOriginal != 0) { return t; } + + // if this token had a "normal" gap of 1, remove it. + if (posInc==1) origPosIncrement-=1; continue; } diff --git a/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java b/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java index d0d17f1fb19..1b0e83f039d 100644 --- a/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java +++ b/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java @@ -18,12 +18,21 @@ package org.apache.solr.analysis; import org.apache.solr.util.AbstractSolrTestCase; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; +import java.io.Reader; import java.io.StringReader; +import java.util.Arrays; +import java.util.HashSet; /** * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest @@ -388,5 +397,113 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase { doSplitPossessive(1, "ra's", "ra"); doSplitPossessive(0, "ra's", "ra", "s"); } + + /* + * Set a large position increment gap of 10 if the token is "largegap" or "/" + */ + private final class LargePosIncTokenFilter extends TokenFilter { + private TermAttribute termAtt; + private PositionIncrementAttribute posIncAtt; + + protected LargePosIncTokenFilter(TokenStream input) { + super(input); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + } + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (termAtt.term().equals("largegap") || termAtt.term().equals("/")) + posIncAtt.setPositionIncrement(10); + return true; + } else { + return false; + } + } + } + + public void testPositionIncrements() throws Exception { + final CharArraySet protWords = new CharArraySet(new HashSet(Arrays.asList("NUTCH")), false); + + /* analyzer that uses whitespace + wdf */ + Analyzer a = new Analyzer() { + public TokenStream tokenStream(String field, Reader reader) { + return new WordDelimiterFilter( + new WhitespaceTokenizer(reader), + 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords); + } + }; + + /* in this case, works as expected. */ + assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, + new int[] { 0, 9 }, + new int[] { 6, 13 }, + new int[] { 1, 1 }); + + /* only in this case, posInc of 2 ?! */ + assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" }, + new int[] { 0, 9, 12, 9 }, + new int[] { 6, 12, 13, 13 }, + new int[] { 1, 1, 1, 0 }); + + assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, + new int[] { 0, 9, 15 }, + new int[] { 6, 14, 19 }, + new int[] { 1, 1, 1 }); + + /* analyzer that will consume tokens with large position increments */ + Analyzer a2 = new Analyzer() { + public TokenStream tokenStream(String field, Reader reader) { + return new WordDelimiterFilter( + new LargePosIncTokenFilter( + new WhitespaceTokenizer(reader)), + 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords); + } + }; + + /* increment of "largegap" is preserved */ + assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" }, + new int[] { 0, 7, 16 }, + new int[] { 6, 15, 20 }, + new int[] { 1, 10, 1 }); + + /* the "/" had a position increment of 10, where did it go?!?!! */ + assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, + new int[] { 0, 9 }, + new int[] { 6, 13 }, + new int[] { 1, 11 }); + + /* in this case, the increment of 10 from the "/" is carried over */ + assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" }, + new int[] { 0, 9, 12, 9 }, + new int[] { 6, 12, 13, 13 }, + new int[] { 1, 11, 1, 0 }); + + assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, + new int[] { 0, 9, 15 }, + new int[] { 6, 14, 19 }, + new int[] { 1, 11, 1 }); + } + + private void assertAnalyzesTo(Analyzer a, String input, String[] output, + int startOffsets[], int endOffsets[], int posIncs[]) throws Exception { + + TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + TermAttribute termAtt = (TermAttribute) ts + .getAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) ts + .getAttribute(OffsetAttribute.class); + PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts + .getAttribute(PositionIncrementAttribute.class); + for (int i = 0; i < output.length; i++) { + assertTrue(ts.incrementToken()); + assertEquals(output[i], termAtt.term()); + assertEquals(startOffsets[i], offsetAtt.startOffset()); + assertEquals(endOffsets[i], offsetAtt.endOffset()); + assertEquals(posIncs[i], posIncAtt.getPositionIncrement()); + } + assertFalse(ts.incrementToken()); + ts.close(); + } }