SOLR-1394: calculate offsets correctly for entities

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@826299 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2009-10-17 19:56:01 +00:00
parent 16df0cfe16
commit 1194844912
3 changed files with 34 additions and 3 deletions

View File

@ -628,7 +628,8 @@ Bug Fixes
72. SOLR-1504: empty char mapping can cause ArrayIndexOutOfBoundsException in analysis.jsp and co. 72. SOLR-1504: empty char mapping can cause ArrayIndexOutOfBoundsException in analysis.jsp and co.
(koji) (koji)
73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities. 73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities and
often calculated offsets incorrectly for entities.
(Anders Melchiorsen via yonik) (Anders Melchiorsen via yonik)
Other Changes Other Changes

View File

@ -175,6 +175,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {
private int readNumericEntity() throws IOException { private int readNumericEntity() throws IOException {
// "&#" has already been read at this point // "&#" has already been read at this point
int eaten = 2;
// is this decimal, hex, or nothing at all. // is this decimal, hex, or nothing at all.
int ch = next(); int ch = next();
@ -194,6 +195,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {
} }
} }
} else if (ch=='x') { } else if (ch=='x') {
eaten++;
// hex character entity // hex character entity
base=16; base=16;
sb.setLength(0); sb.setLength(0);
@ -215,7 +217,8 @@ public class HTMLStripCharFilter extends BaseCharFilter {
// the entity. // the entity.
try { try {
if (ch==';' || ch==-1) { if (ch==';' || ch==-1) {
numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char // do not account for the eaten ";" due to the fact that we do output a char
numWhitespace = sb.length() + eaten;
return Integer.parseInt(sb.toString(), base); return Integer.parseInt(sb.toString(), base);
} }
@ -223,7 +226,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {
// that whitespace on the next call to read(). // that whitespace on the next call to read().
if (isSpace(ch)) { if (isSpace(ch)) {
push(ch); push(ch);
numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char numWhitespace = sb.length() + eaten;
return Integer.parseInt(sb.toString(), base); return Integer.parseInt(sb.toString(), base);
} }
} catch (NumberFormatException e) { } catch (NumberFormatException e) {

View File

@ -236,4 +236,31 @@ public class HTMLStripCharFilterTest extends TestCase {
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true); assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
} }
/**
 * Reads {@code in} through an HTMLStripCharFilter one character at a time and,
 * for every 'X' that comes out of the filter, asserts that the filter's
 * corrected offset equals the index of the next not-yet-matched 'X' in the
 * original string.
 *
 * @param in the raw input text; 'X' characters act as the offset markers
 * @throws Exception if reading from the filter fails
 */
public void doTestOffsets(String in) throws Exception {
  HTMLStripCharFilter filter = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
  int expectedPos = -1; // index in the original string of the last matched 'X'
  int c;
  // outPos counts characters produced by the filter so far
  for (int outPos = 0; (c = filter.read()) != -1; outPos++) {
    // the corrected offset is computed for every character, marker or not
    int mappedPos = filter.correctOffset(outPos);
    if (c != 'X') {
      continue;
    }
    // advance to the next 'X' in the input and compare positions
    expectedPos = in.indexOf('X', expectedPos + 1);
    assertEquals(expectedPos, mappedPos);
  }
}
/**
 * Runs {@code doTestOffsets} over a fixed set of inputs: plain text, text with
 * tags, text with entities, and text with incomplete tag/entity starts
 * (the backtracking case).
 *
 * @throws Exception if any offset check fails or reading fails
 */
public void testOffsets() throws Exception {
  String[] inputs = {
    "hello X how X are you",
    "hello <p> X<p> how <p>X are you",
    "X &amp; X &#40; X &lt; &gt; X",
    // test backtracking
    "X < &zz >X &# < X > < &l > &g < X",
  };
  for (String input : inputs) {
    doTestOffsets(input);
  }
}
} }