SOLR-331: fix WordDelimiterFilter offsets for synonyms

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@564635 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2007-08-10 15:53:03 +00:00
parent 7cb55d9904
commit 1d70c0e51b
2 changed files with 72 additions and 2 deletions


@@ -205,9 +205,20 @@ final class WordDelimiterFilter extends TokenFilter {
   private Token newTok(Token orig, int start, int end) {
+    int startOff = orig.startOffset();
+    int endOff = orig.endOffset();
+    String origStr = orig.termText();
+    // if the length implied by the start/end offsets doesn't match the term
+    // text, assume this is a synonym and don't adjust the offsets.
+    if (origStr.length() == endOff-startOff) {
+      endOff = startOff + end;
+      startOff += start;
+    }
     return new Token(orig.termText().substring(start,end),
-                     orig.startOffset() + start,
-                     orig.startOffset() + end,
+                     startOff,
+                     endOff,
                      orig.type());
   }
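
For context on the check added above: a token injected by a synonym filter keeps the offsets of the original text it matched, so its term length need not equal endOffset() - startOffset(). A minimal sketch of the two cases, reusing the Token constructor that the new test below uses (the "ipod"/"i-pod" text and offsets are illustrative assumptions, not taken from this commit):

// Original text contains "ipod" at offsets 0-4; a synonym filter injects the
// alternative spelling "i-pod" at the same position with the same offsets.
Token actual  = new Token("ipod",  0, 4); // termText().length() == 4 == 4 - 0
Token synonym = new Token("i-pod", 0, 4); // termText().length() == 5 != 4 - 0

// When WordDelimiterFilter splits the synonym into "i" and "pod", newTok()
// sees the length mismatch and keeps offsets 0-4 for both subwords; the old
// orig.startOffset() + end arithmetic would have produced an end offset of 5
// for "pod", past the end of "ipod" in the original text.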


@@ -20,6 +20,12 @@ package org.apache.solr.analysis;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.util.TestHarness;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
import junit.framework.Assert;

/**
 * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
@@ -82,4 +88,57 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
        ,"//result[@numFound=1]"
    );
  }

  public void testOffsets() throws IOException {
    // test that subwords and catenated subwords have
    // the correct offsets.
    WordDelimiterFilter wdf = new WordDelimiterFilter(
        new TokenStream() {
          Token t;
          public Token next() throws IOException {
            if (t!=null) return null;
            t = new Token("foo-bar", 5, 12); // actual
            return t;
          }
        },
        1,1,0,0,1,1);

    int i=0;
    for(Token t; (t=wdf.next())!=null;) {
      if (t.termText().equals("foo")) {
        assertEquals(5, t.startOffset());
        assertEquals(8, t.endOffset());
        i++;
      }
      if (t.termText().equals("bar")) {
        assertEquals(9, t.startOffset());
        assertEquals(12, t.endOffset());
        i++;
      }
      if (t.termText().equals("foobar")) {
        assertEquals(5, t.startOffset());
        assertEquals(12, t.endOffset());
        i++;
      }
    }
    assertEquals(3,i); // make sure all 3 tokens were generated

    // test that when splitting or catenating a synonym, the offsets
    // are not altered (they would be incorrect).
    wdf = new WordDelimiterFilter(
        new TokenStream() {
          Token t;
          public Token next() throws IOException {
            if (t!=null) return null;
            t = new Token("foo-bar", 5, 6); // a synonym
            return t;
          }
        },
        1,1,0,0,1,1);

    for(Token t; (t=wdf.next())!=null;) {
      assertEquals(5, t.startOffset());
      assertEquals(6, t.endOffset());
    }
  }
}