mirror of https://github.com/apache/lucene.git
SOLR-1394: calculate offsets correctly for entities
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@826299 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
16df0cfe16
commit
1194844912
|
@ -628,7 +628,8 @@ Bug Fixes
|
|||
72. SOLR-1504: empty char mapping can cause ArrayIndexOutOfBoundsException in analysis.jsp and co.
|
||||
(koji)
|
||||
|
||||
73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities.
|
||||
73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities and
|
||||
often calculated offsets incorrectly for entities.
|
||||
(Anders Melchiorsen via yonik)
|
||||
|
||||
Other Changes
|
||||
|
|
|
@ -175,6 +175,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {
|
|||
|
||||
private int readNumericEntity() throws IOException {
|
||||
// "&#" has already been read at this point
|
||||
int eaten = 2;
|
||||
|
||||
// is this decimal, hex, or nothing at all.
|
||||
int ch = next();
|
||||
|
@ -194,6 +195,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {
|
|||
}
|
||||
}
|
||||
} else if (ch=='x') {
|
||||
eaten++;
|
||||
// hex character entity
|
||||
base=16;
|
||||
sb.setLength(0);
|
||||
|
@ -215,7 +217,8 @@ public class HTMLStripCharFilter extends BaseCharFilter {
|
|||
// the entity.
|
||||
try {
|
||||
if (ch==';' || ch==-1) {
|
||||
numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char
|
||||
// do not account for the eaten ";" due to the fact that we do output a char
|
||||
numWhitespace = sb.length() + eaten;
|
||||
return Integer.parseInt(sb.toString(), base);
|
||||
}
|
||||
|
||||
|
@ -223,7 +226,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {
|
|||
// that whitespace on the next call to read().
|
||||
if (isSpace(ch)) {
|
||||
push(ch);
|
||||
numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char
|
||||
numWhitespace = sb.length() + eaten;
|
||||
return Integer.parseInt(sb.toString(), base);
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
|
|
|
@ -236,4 +236,31 @@ public class HTMLStripCharFilterTest extends TestCase {
|
|||
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
|
||||
}
|
||||
|
||||
|
||||
public void doTestOffsets(String in) throws Exception {
|
||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
|
||||
int ch = 0;
|
||||
int off = 0; // offset in the reader
|
||||
int strOff = -1; // offset in the original string
|
||||
while ((ch = reader.read()) != -1) {
|
||||
int correctedOff = reader.correctOffset(off);
|
||||
|
||||
if (ch == 'X') {
|
||||
strOff = in.indexOf('X',strOff+1);
|
||||
assertEquals(strOff, correctedOff);
|
||||
}
|
||||
|
||||
off++;
|
||||
}
|
||||
}
|
||||
|
||||
public void testOffsets() throws Exception {
|
||||
doTestOffsets("hello X how X are you");
|
||||
doTestOffsets("hello <p> X<p> how <p>X are you");
|
||||
doTestOffsets("X & X ( X < > X");
|
||||
|
||||
// test backtracking
|
||||
doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue