fix synonymfilter offsets: SOLR-167

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@513517 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2007-03-01 21:36:36 +00:00
parent 52d00cda6a
commit 6fdf11d7aa
3 changed files with 72 additions and 10 deletions

View File

@ -175,6 +175,9 @@ Bug Fixes
7. SOLR-168: Fix display positioning of multiple tokens at the same
position in analysis.jsp (yonik)
8. SOLR-167: The SynonymFilter sometimes generated incorrect offsets when
multi token synonyms were mached in the source text. (yonik)
Other Changes
1. Updated to Lucene 2.1

View File

@ -86,8 +86,7 @@ public class SynonymFilter extends TokenFilter {
// OK, we matched a token, so find the longest match.
// since matched is only used for matches >= 2, defer creation until now
if (matched==null) matched=new LinkedList();
matched = new LinkedList();
SynonymMap result = match((SynonymMap)o);

View File

@ -43,6 +43,7 @@ public class TestSynonymFilter extends TestCase {
* a b c => returns List<Token> [a,b,c]
* a/b => tokens a and b share the same spot (b.positionIncrement=0)
* a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
* a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
*/
public List tokens(String str) {
String[] arr = str.split(" ");
@ -50,8 +51,32 @@ public class TestSynonymFilter extends TestCase {
for (int i=0; i<arr.length; i++) {
String[] toks = arr[i].split("/");
String[] params = toks[0].split(",");
Token t = new Token(params[0],0,0,"TEST");
if (params.length > 1) t.setPositionIncrement(Integer.parseInt(params[1]));
int posInc;
int start;
int end;
if (params.length > 1) {
posInc = Integer.parseInt(params[1]);
} else {
posInc = 1;
}
if (params.length > 2) {
start = Integer.parseInt(params[2]);
} else {
start = 0;
}
if (params.length > 3) {
end = Integer.parseInt(params[3]);
} else {
end = start + params[0].length();
}
Token t = new Token(params[0],start,end,"TEST");
t.setPositionIncrement(posInc);
result.add(t);
for (int j=1; j<toks.length; j++) {
t = new Token(toks[j],0,0,"TEST");
@ -91,27 +116,42 @@ public class TestSynonymFilter extends TestCase {
public void assertTokEqual(List a, List b) {
assertTokEq(a,b);
assertTokEq(b,a);
assertTokEq(a,b,false);
assertTokEq(b,a,false);
}
private void assertTokEq(List a, List b) {
public void assertTokEqualOff(List a, List b) {
assertTokEq(a,b,true);
assertTokEq(b,a,true);
}
private void assertTokEq(List a, List b, boolean checkOff) {
int pos=0;
for (Iterator iter = a.iterator(); iter.hasNext();) {
Token tok = (Token)iter.next();
pos += tok.getPositionIncrement();
if (!tokAt(b, tok.termText(), pos)) {
if (!tokAt(b, tok.termText(), pos
, checkOff ? tok.startOffset() : -1
, checkOff ? tok.endOffset() : -1
))
{
fail(a + "!=" + b);
}
}
}
public boolean tokAt(List lst, String val, int tokPos) {
public boolean tokAt(List lst, String val, int tokPos, int startOff, int endOff) {
int pos=0;
for (Iterator iter = lst.iterator(); iter.hasNext();) {
Token tok = (Token)iter.next();
pos += tok.getPositionIncrement();
if (pos==tokPos && tok.termText().equals(val)) return true;
if (pos==tokPos && tok.termText().equals(val)
&& (startOff==-1 || tok.startOffset()==startOff)
&& (endOff==-1 || tok.endOffset()==endOff)
)
{
return true;
}
}
return false;
}
@ -282,4 +322,24 @@ public class TestSynonymFilter extends TestCase {
}
public void testOffsetBug() throws IOException {
// With the following rules:
// a a=>b
// x=>y
// analysing "a x" causes "y" to have a bad offset (end less than start)
// SOLR-167
SynonymMap map = new SynonymMap();
boolean orig = false;
boolean merge = true;
map.add(strings("a a"), tokens("b"), orig, merge);
map.add(strings("x"), tokens("y"), orig, merge);
System.out.println(getTokList(map,"a,1,0,1 a,1,2,3 x,1,4,5",false));
// "a a x" => "b y"
assertTokEqualOff(getTokList(map,"a,1,0,1 a,1,2,3 x,1,4,5",false), tokens("b,1,0,3 y,1,4,5"));
}
}