SOLR-1362: fix WDF position increment bugs
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@808005 13f79535-47bb-0310-9956-ffa450edef68
parent 1ece23820f
commit b4dd3f78f4
@@ -509,6 +509,11 @@ Bug Fixes
     already been closed/destroyed; if it hasn't, a SEVERE error is
     logged first. (noble, hossman)
 
+60. SOLR-1362: WordDelimiterFilter had inconsistent behavior when setting
+    the position increment of tokens following a token consisting of all
+    delimiters, and could additionally lose big position increments.
+    (Robert Muir, yonik)
+
 Other Changes
 ----------------------
  1. Upgraded to Lucene 2.4.0 (yonik)
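For context on the entry above, here is a minimal sketch of the fixed behavior, reusing the exact filter flags and the expected increments from the test added at the bottom of this commit. The class scaffolding and the println loop are illustrative only, not part of the patch; since WordDelimiterFilter is package-private, the sketch assumes it is placed in org.apache.solr.analysis.

package org.apache.solr.analysis;

import java.io.StringReader;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class WdfPosIncSketch {
  public static void main(String[] args) throws Exception {
    // "/" is all delimiters, so the filter emits nothing for it. With the
    // normal gap of 1 its position is dropped outright, so this prints
    // LUCENE +1 and SOLR +1 (the test's { 1, 1 } case). When "/" instead
    // carries a gap of 10, the test expects { 1, 11 }: the gap survives.
    TokenStream ts = new WordDelimiterFilter(
        new WhitespaceTokenizer(new StringReader("LUCENE / SOLR")),
        1, 1, 0, 0, 1, 1, 0, 1, 1, (CharArraySet) null); // same flags as the new test
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncAtt =
        (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(termAtt.term() + " +" + posIncAtt.getPositionIncrement());
    }
    ts.close();
  }
}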
@@ -384,13 +384,15 @@ final class WordDelimiterFilter extends TokenFilter {
       int start=0;
       if (len ==0) continue;
 
+      int posInc = t.getPositionIncrement();
+      origPosIncrement += posInc;
+
       //skip protected tokens
       if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+        t.setPositionIncrement(origPosIncrement);
         return t;
       }
 
-      origPosIncrement += t.getPositionIncrement();
-
       // Avoid calling charType more than once for each char (basically
       // avoid any backtracking).
       // makes code slightly more difficult, but faster.
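The bookkeeping this hunk introduces is easier to see in isolation. Below is a simplified model, not the real filter: onToken, producesOutput, and the reset at the end are stand-ins for the surrounding filter logic.

// Every incoming token's increment is folded into origPosIncrement up front;
// the next token that is actually returned carries the whole accumulated gap.
final class PosIncAccumulatorSketch {
  private int origPosIncrement = 0;

  /**
   * posInc: increment of the incoming token.
   * producesOutput: false for tokens the filter swallows (e.g. all delimiters).
   * Returns the increment the emitted token should carry, or -1 if none.
   */
  int onToken(int posInc, boolean producesOutput) {
    origPosIncrement += posInc;
    if (!producesOutput) {
      if (posInc == 1) origPosIncrement -= 1; // drop the "normal" gap of 1
      return -1;
    }
    int carried = origPosIncrement;
    origPosIncrement = 0; // assumed reset between emitted tokens
    return carried;
  }
}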
@@ -482,6 +484,7 @@ final class WordDelimiterFilter extends TokenFilter {
         if (start==0) {
           // the subword is the whole original token, so
           // return it unchanged.
+          t.setPositionIncrement(origPosIncrement);
           return t;
         }
 
@@ -492,6 +495,7 @@ final class WordDelimiterFilter extends TokenFilter {
           // of the original token
           t.setTermBuffer(termBuffer, start, len-start);
           t.setStartOffset(t.startOffset() + start);
+          t.setPositionIncrement(origPosIncrement);
           return t;
         }
 
@@ -524,6 +528,9 @@ final class WordDelimiterFilter extends TokenFilter {
         if (preserveOriginal != 0) {
           return t;
         }
+
+        // if this token had a "normal" gap of 1, remove it.
+        if (posInc==1) origPosIncrement-=1;
         continue;
       }
 
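A worked example of the gap arithmetic in the hunk above, using the increments asserted by the new test; plain ints only, no Lucene types, and the class wrapper is just scaffolding.

public class GapArithmeticSketch {
  public static void main(String[] args) {
    // Case 1: "/" carries the normal gap of 1, so its position is removed.
    int origPosIncrement = 0;
    int posInc = 1;                         // increment on the "/" token
    origPosIncrement += posInc;             // 1
    if (posInc == 1) origPosIncrement -= 1; // back to 0: gap removed
    origPosIncrement += 1;                  // "SOLR" adds its own gap of 1
    System.out.println(origPosIncrement);   // 1, matching the test's { 1, 1 }

    // Case 2: "/" carries a gap of 10, so the gap is carried over.
    origPosIncrement = 0;
    posInc = 10;
    origPosIncrement += posInc;             // 10; posInc != 1, nothing removed
    origPosIncrement += 1;                  // "SOLR" adds its own gap of 1
    System.out.println(origPosIncrement);   // 11, matching the test's { 1, 11 }
  }
}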
@@ -18,12 +18,21 @@
 package org.apache.solr.analysis;
 
 import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
+import java.util.HashSet;
 
 /**
  * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
@@ -388,5 +397,113 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
     doSplitPossessive(1, "ra's", "ra");
     doSplitPossessive(0, "ra's", "ra", "s");
   }
 
+  /*
+   * Set a large position increment gap of 10 if the token is "largegap" or "/"
+   */
+  private final class LargePosIncTokenFilter extends TokenFilter {
+    private TermAttribute termAtt;
+    private PositionIncrementAttribute posIncAtt;
+
+    protected LargePosIncTokenFilter(TokenStream input) {
+      super(input);
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        if (termAtt.term().equals("largegap") || termAtt.term().equals("/"))
+          posIncAtt.setPositionIncrement(10);
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  public void testPositionIncrements() throws Exception {
+    final CharArraySet protWords = new CharArraySet(new HashSet<String>(Arrays.asList("NUTCH")), false);
+
+    /* analyzer that uses whitespace + wdf */
+    Analyzer a = new Analyzer() {
+      public TokenStream tokenStream(String field, Reader reader) {
+        return new WordDelimiterFilter(
+            new WhitespaceTokenizer(reader),
+            1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
+      }
+    };
+
+    /* in this case, works as expected. */
+    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
+        new int[] { 0, 9 },
+        new int[] { 6, 13 },
+        new int[] { 1, 1 });
+
+    /* only in this case, posInc of 2 ?! */
+    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
+        new int[] { 0, 9, 12, 9 },
+        new int[] { 6, 12, 13, 13 },
+        new int[] { 1, 1, 1, 0 });
+
+    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
+        new int[] { 0, 9, 15 },
+        new int[] { 6, 14, 19 },
+        new int[] { 1, 1, 1 });
+
+    /* analyzer that will consume tokens with large position increments */
+    Analyzer a2 = new Analyzer() {
+      public TokenStream tokenStream(String field, Reader reader) {
+        return new WordDelimiterFilter(
+            new LargePosIncTokenFilter(
+                new WhitespaceTokenizer(reader)),
+            1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
+      }
+    };
+
+    /* increment of "largegap" is preserved */
+    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
+        new int[] { 0, 7, 16 },
+        new int[] { 6, 15, 20 },
+        new int[] { 1, 10, 1 });
+
+    /* the "/" had a position increment of 10, where did it go?!?!! */
+    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
+        new int[] { 0, 9 },
+        new int[] { 6, 13 },
+        new int[] { 1, 11 });
+
+    /* in this case, the increment of 10 from the "/" is carried over */
+    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
+        new int[] { 0, 9, 12, 9 },
+        new int[] { 6, 12, 13, 13 },
+        new int[] { 1, 11, 1, 0 });
+
+    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
+        new int[] { 0, 9, 15 },
+        new int[] { 6, 14, 19 },
+        new int[] { 1, 11, 1 });
+  }
+
+  private void assertAnalyzesTo(Analyzer a, String input, String[] output,
+      int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {
+
+    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    TermAttribute termAtt = (TermAttribute) ts
+        .getAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) ts
+        .getAttribute(OffsetAttribute.class);
+    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
+        .getAttribute(PositionIncrementAttribute.class);
+    for (int i = 0; i < output.length; i++) {
+      assertTrue(ts.incrementToken());
+      assertEquals(output[i], termAtt.term());
+      assertEquals(startOffsets[i], offsetAtt.startOffset());
+      assertEquals(endOffsets[i], offsetAtt.endOffset());
+      assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
+    }
+    assertFalse(ts.incrementToken());
+    ts.close();
+  }
 }