mirror of https://github.com/apache/lucene.git
SOLR-1394: calculate offsets correctly for entities
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@826299 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
16df0cfe16
commit
1194844912
|
@ -628,7 +628,8 @@ Bug Fixes
|
||||||
72. SOLR-1504: empty char mapping can cause ArrayIndexOutOfBoundsException in analysis.jsp and co.
|
72. SOLR-1504: empty char mapping can cause ArrayIndexOutOfBoundsException in analysis.jsp and co.
|
||||||
(koji)
|
(koji)
|
||||||
|
|
||||||
73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities.
|
73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities and
|
||||||
|
often calculated offsets incorrectly for entities.
|
||||||
(Anders Melchiorsen via yonik)
|
(Anders Melchiorsen via yonik)
|
||||||
|
|
||||||
Other Changes
|
Other Changes
|
||||||
|
|
|
@ -175,6 +175,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
|
|
||||||
private int readNumericEntity() throws IOException {
|
private int readNumericEntity() throws IOException {
|
||||||
// "&#" has already been read at this point
|
// "&#" has already been read at this point
|
||||||
|
int eaten = 2;
|
||||||
|
|
||||||
// is this decimal, hex, or nothing at all.
|
// is this decimal, hex, or nothing at all.
|
||||||
int ch = next();
|
int ch = next();
|
||||||
|
@ -194,6 +195,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (ch=='x') {
|
} else if (ch=='x') {
|
||||||
|
eaten++;
|
||||||
// hex character entity
|
// hex character entity
|
||||||
base=16;
|
base=16;
|
||||||
sb.setLength(0);
|
sb.setLength(0);
|
||||||
|
@ -215,7 +217,8 @@ public class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
// the entity.
|
// the entity.
|
||||||
try {
|
try {
|
||||||
if (ch==';' || ch==-1) {
|
if (ch==';' || ch==-1) {
|
||||||
numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char
|
// do not account for the eaten ";" due to the fact that we do output a char
|
||||||
|
numWhitespace = sb.length() + eaten;
|
||||||
return Integer.parseInt(sb.toString(), base);
|
return Integer.parseInt(sb.toString(), base);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -223,7 +226,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
// that whitespace on the next call to read().
|
// that whitespace on the next call to read().
|
||||||
if (isSpace(ch)) {
|
if (isSpace(ch)) {
|
||||||
push(ch);
|
push(ch);
|
||||||
numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char
|
numWhitespace = sb.length() + eaten;
|
||||||
return Integer.parseInt(sb.toString(), base);
|
return Integer.parseInt(sb.toString(), base);
|
||||||
}
|
}
|
||||||
} catch (NumberFormatException e) {
|
} catch (NumberFormatException e) {
|
||||||
|
|
|
@ -236,4 +236,31 @@ public class HTMLStripCharFilterTest extends TestCase {
|
||||||
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
|
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void doTestOffsets(String in) throws Exception {
|
||||||
|
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
|
||||||
|
int ch = 0;
|
||||||
|
int off = 0; // offset in the reader
|
||||||
|
int strOff = -1; // offset in the original string
|
||||||
|
while ((ch = reader.read()) != -1) {
|
||||||
|
int correctedOff = reader.correctOffset(off);
|
||||||
|
|
||||||
|
if (ch == 'X') {
|
||||||
|
strOff = in.indexOf('X',strOff+1);
|
||||||
|
assertEquals(strOff, correctedOff);
|
||||||
|
}
|
||||||
|
|
||||||
|
off++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOffsets() throws Exception {
|
||||||
|
doTestOffsets("hello X how X are you");
|
||||||
|
doTestOffsets("hello <p> X<p> how <p>X are you");
|
||||||
|
doTestOffsets("X & X ( X < > X");
|
||||||
|
|
||||||
|
// test backtracking
|
||||||
|
doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue