mirror of https://github.com/apache/lucene.git
fix synonymfilter offsets: SOLR-167
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@513517 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
52d00cda6a
commit
6fdf11d7aa
|
@ -175,6 +175,9 @@ Bug Fixes
|
|||
|
||||
7. SOLR-168: Fix display positioning of multiple tokens at the same
|
||||
position in analysis.jsp (yonik)
|
||||
|
||||
8. SOLR-167: The SynonymFilter sometimes generated incorrect offsets when
|
||||
multi token synonyms were mached in the source text. (yonik)
|
||||
|
||||
Other Changes
|
||||
1. Updated to Lucene 2.1
|
||||
|
|
|
@ -86,8 +86,7 @@ public class SynonymFilter extends TokenFilter {
|
|||
|
||||
// OK, we matched a token, so find the longest match.
|
||||
|
||||
// since matched is only used for matches >= 2, defer creation until now
|
||||
if (matched==null) matched=new LinkedList();
|
||||
matched = new LinkedList();
|
||||
|
||||
SynonymMap result = match((SynonymMap)o);
|
||||
|
||||
|
|
|
@ -43,6 +43,7 @@ public class TestSynonymFilter extends TestCase {
|
|||
* a b c => returns List<Token> [a,b,c]
|
||||
* a/b => tokens a and b share the same spot (b.positionIncrement=0)
|
||||
* a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
|
||||
* a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
|
||||
*/
|
||||
public List tokens(String str) {
|
||||
String[] arr = str.split(" ");
|
||||
|
@ -50,8 +51,32 @@ public class TestSynonymFilter extends TestCase {
|
|||
for (int i=0; i<arr.length; i++) {
|
||||
String[] toks = arr[i].split("/");
|
||||
String[] params = toks[0].split(",");
|
||||
Token t = new Token(params[0],0,0,"TEST");
|
||||
if (params.length > 1) t.setPositionIncrement(Integer.parseInt(params[1]));
|
||||
|
||||
int posInc;
|
||||
int start;
|
||||
int end;
|
||||
|
||||
if (params.length > 1) {
|
||||
posInc = Integer.parseInt(params[1]);
|
||||
} else {
|
||||
posInc = 1;
|
||||
}
|
||||
|
||||
if (params.length > 2) {
|
||||
start = Integer.parseInt(params[2]);
|
||||
} else {
|
||||
start = 0;
|
||||
}
|
||||
|
||||
if (params.length > 3) {
|
||||
end = Integer.parseInt(params[3]);
|
||||
} else {
|
||||
end = start + params[0].length();
|
||||
}
|
||||
|
||||
Token t = new Token(params[0],start,end,"TEST");
|
||||
t.setPositionIncrement(posInc);
|
||||
|
||||
result.add(t);
|
||||
for (int j=1; j<toks.length; j++) {
|
||||
t = new Token(toks[j],0,0,"TEST");
|
||||
|
@ -91,27 +116,42 @@ public class TestSynonymFilter extends TestCase {
|
|||
|
||||
|
||||
public void assertTokEqual(List a, List b) {
|
||||
assertTokEq(a,b);
|
||||
assertTokEq(b,a);
|
||||
assertTokEq(a,b,false);
|
||||
assertTokEq(b,a,false);
|
||||
}
|
||||
|
||||
private void assertTokEq(List a, List b) {
|
||||
public void assertTokEqualOff(List a, List b) {
|
||||
assertTokEq(a,b,true);
|
||||
assertTokEq(b,a,true);
|
||||
}
|
||||
|
||||
private void assertTokEq(List a, List b, boolean checkOff) {
|
||||
int pos=0;
|
||||
for (Iterator iter = a.iterator(); iter.hasNext();) {
|
||||
Token tok = (Token)iter.next();
|
||||
pos += tok.getPositionIncrement();
|
||||
if (!tokAt(b, tok.termText(), pos)) {
|
||||
if (!tokAt(b, tok.termText(), pos
|
||||
, checkOff ? tok.startOffset() : -1
|
||||
, checkOff ? tok.endOffset() : -1
|
||||
))
|
||||
{
|
||||
fail(a + "!=" + b);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public boolean tokAt(List lst, String val, int tokPos) {
|
||||
public boolean tokAt(List lst, String val, int tokPos, int startOff, int endOff) {
|
||||
int pos=0;
|
||||
for (Iterator iter = lst.iterator(); iter.hasNext();) {
|
||||
Token tok = (Token)iter.next();
|
||||
pos += tok.getPositionIncrement();
|
||||
if (pos==tokPos && tok.termText().equals(val)) return true;
|
||||
if (pos==tokPos && tok.termText().equals(val)
|
||||
&& (startOff==-1 || tok.startOffset()==startOff)
|
||||
&& (endOff==-1 || tok.endOffset()==endOff)
|
||||
)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
@ -282,4 +322,24 @@ public class TestSynonymFilter extends TestCase {
|
|||
}
|
||||
|
||||
|
||||
public void testOffsetBug() throws IOException {
|
||||
// With the following rules:
|
||||
// a a=>b
|
||||
// x=>y
|
||||
// analysing "a x" causes "y" to have a bad offset (end less than start)
|
||||
// SOLR-167
|
||||
SynonymMap map = new SynonymMap();
|
||||
|
||||
boolean orig = false;
|
||||
boolean merge = true;
|
||||
|
||||
map.add(strings("a a"), tokens("b"), orig, merge);
|
||||
map.add(strings("x"), tokens("y"), orig, merge);
|
||||
|
||||
System.out.println(getTokList(map,"a,1,0,1 a,1,2,3 x,1,4,5",false));
|
||||
|
||||
// "a a x" => "b y"
|
||||
assertTokEqualOff(getTokList(map,"a,1,0,1 a,1,2,3 x,1,4,5",false), tokens("b,1,0,3 y,1,4,5"));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue