From 1194844912a1253a8dd23625c88576156f2514e7 Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Sat, 17 Oct 2009 19:56:01 +0000 Subject: [PATCH] SOLR-1394: calculate offsets correctly for entities git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@826299 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 ++- .../solr/analysis/HTMLStripCharFilter.java | 7 +++-- .../analysis/HTMLStripCharFilterTest.java | 27 +++++++++++++++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 39ddcbffcbe..64f7a21ca35 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -628,7 +628,8 @@ Bug Fixes 72. SOLR-1504: empty char mapping can cause ArrayIndexOutOfBoundsException in analysis.jsp and co. (koji) -73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities. +73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities and + often calculated offsets incorrectly for entities. (Anders Melchiorsen via yonik) Other Changes diff --git a/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java b/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java index 92891e1b5e6..b2ecf328536 100644 --- a/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java +++ b/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java @@ -175,6 +175,7 @@ public class HTMLStripCharFilter extends BaseCharFilter { private int readNumericEntity() throws IOException { // "&#" has already been read at this point + int eaten = 2; // is this decimal, hex, or nothing at all. int ch = next(); @@ -194,6 +195,7 @@ public class HTMLStripCharFilter extends BaseCharFilter { } } } else if (ch=='x') { + eaten++; // hex character entity base=16; sb.setLength(0); @@ -215,7 +217,8 @@ public class HTMLStripCharFilter extends BaseCharFilter { // the entity. try { if (ch==';' || ch==-1) { - numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char + // do not account for the eaten ";" due to the fact that we do output a char + numWhitespace = sb.length() + eaten; return Integer.parseInt(sb.toString(), base); } @@ -223,7 +226,7 @@ public class HTMLStripCharFilter extends BaseCharFilter { // that whitespace on the next call to read(). if (isSpace(ch)) { push(ch); - numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char + numWhitespace = sb.length() + eaten; return Integer.parseInt(sb.toString(), base); } } catch (NumberFormatException e) { diff --git a/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java b/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java index 0feb0b73e24..3e55f2a9df6 100644 --- a/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java +++ b/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java @@ -236,4 +236,31 @@ public class HTMLStripCharFilterTest extends TestCase { assertTrue(builder.toString() + " is not equal to " + gold + "", builder.toString().equals(gold) == true); } + + public void doTestOffsets(String in) throws Exception { + HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in)))); + int ch = 0; + int off = 0; // offset in the reader + int strOff = -1; // offset in the original string + while ((ch = reader.read()) != -1) { + int correctedOff = reader.correctOffset(off); + + if (ch == 'X') { + strOff = in.indexOf('X',strOff+1); + assertEquals(strOff, correctedOff); + } + + off++; + } + } + + public void testOffsets() throws Exception { + doTestOffsets("hello X how X are you"); + doTestOffsets("hello

X

how

X are you"); + doTestOffsets("X & X ( X < > X"); + + // test backtracking + doTestOffsets("X < &zz >X &# < X > < &l > &g < X"); + } + }