SOLR-1394: calculate offsets correctly for entities

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@826299 13f79535-47bb-0310-9956-ffa450edef68
2009-10-17 19:56:01 +00:00 · 2009-10-17 19:56:01 +00:00 · 1194844912
parent 16df0cfe16
commit 1194844912
3 changed files with 34 additions and 3 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -628,7 +628,8 @@ Bug Fixes
 72. SOLR-1504: empty char mapping can cause ArrayIndexOutOfBoundsException in analysis.jsp and co.
    (koji)

-73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities.
+73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities and
+    often calculated offsets incorrectly for entities.
    (Anders Melchiorsen via yonik)

 Other Changes
--- a/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java
+++ b/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java
@ -175,6 +175,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {

  private int readNumericEntity() throws IOException {
    // "&#" has already been read at this point
+    int eaten = 2;

    // is this decimal, hex, or nothing at all.
    int ch = next();
@ -194,6 +195,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {
        }
      }
    } else if (ch=='x') {
+      eaten++;
      // hex character entity
      base=16;
      sb.setLength(0);
@ -215,7 +217,8 @@ public class HTMLStripCharFilter extends BaseCharFilter {
    // the entity.
    try {
      if (ch==';' || ch==-1) {
-        numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char
+        // do not account for the eaten ";" due to the fact that we do output a char
+        numWhitespace = sb.length() + eaten;
        return Integer.parseInt(sb.toString(), base);
      }

@ -223,7 +226,7 @@ public class HTMLStripCharFilter extends BaseCharFilter {
      // that whitespace on the next call to read().
      if (isSpace(ch)) {
        push(ch);
-        numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char
+        numWhitespace = sb.length() + eaten;
        return Integer.parseInt(sb.toString(), base);
      }
    } catch (NumberFormatException e) {
--- a/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java
+++ b/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java
@ -236,4 +236,31 @@ public class HTMLStripCharFilterTest extends TestCase {
    assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
  }

+
+  public void doTestOffsets(String in) throws Exception { // helper: asserts each 'X' read from the filter corrects back to the index of the matching 'X' in the input
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
+    int ch = 0;
+    int off = 0;     // offset in the reader; incremented once per char read from the filter
+    int strOff = -1; // offset in the original string of the last 'X' matched; -1 until the first match
+    while ((ch = reader.read()) != -1) {
+      int correctedOff = reader.correctOffset(off); // presumably maps the filtered offset back to an input offset -- confirm against BaseCharFilter
+
+      if (ch == 'X') {
+        strOff = in.indexOf('X',strOff+1); // next 'X' in the input after the previously matched one
+        assertEquals(strOff, correctedOff);
+      }
+
+      off++;
+    }
+  }
+
+  public void testOffsets() throws Exception { // runs doTestOffsets over inputs with plain text, tags, entities and malformed markup
+    doTestOffsets("hello X how X are you"); // plain text, no markup
+    doTestOffsets("hello <p> X<p> how <p>X are you"); // input containing <p> tags
+    doTestOffsets("X &amp; X &#40; X &lt; &gt; X"); // named entities and one decimal numeric entity; NOTE(review): no hex (&#x..;) entity appears in any input here
+
+    // test backtracking: input has '<' followed by a space and the unterminated sequences &zz, &#, &l, &g
+    doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
+  }
+
 }