From 09e187a533b5b6af59b188388984435d98467d0d Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Fri, 16 Oct 2009 22:21:38 +0000 Subject: [PATCH] SOLR-1394: HTMLStripCharFilter split tokens that contained entities git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@826114 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 + .../solr/analysis/HTMLStripCharFilter.java | 20 +++-- .../analysis/HTMLStripCharFilterTest.java | 76 +++++-------------- 3 files changed, 32 insertions(+), 67 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 7c760f5aefb..39ddcbffcbe 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -628,6 +628,9 @@ Bug Fixes 72. SOLR-1504: empty char mapping can cause ArrayIndexOutOfBoundsException in analysis.jsp and co. (koji) +73. SOLR-1394: HTMLStripCharFilter split tokens that contained entities. + (Anders Melchiorsen via yonik) + Other Changes ---------------------- 1. Upgraded to Lucene 2.4.0 (yonik) diff --git a/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java b/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java index 33723ead0d0..92891e1b5e6 100644 --- a/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java +++ b/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java @@ -38,6 +38,8 @@ public class HTMLStripCharFilter extends BaseCharFilter { private int safeReadAheadLimit = readAheadLimit - 3; private int numWhitespace = 0; private int numRead = 0; + private int numEaten = 0; + private int numReturned = 0; private int lastMark; private Set escapedTags; @@ -535,13 +537,13 @@ public class HTMLStripCharFilter extends BaseCharFilter { private int readName(boolean checkEscaped) throws IOException { StringBuilder builder = (checkEscaped && escapedTags!=null) ? new StringBuilder() : null; - int ch = read(); + int ch = next(); if (builder!=null) builder.append((char)ch); if (!isFirstIdChar(ch)) return MISMATCH; - ch = read(); + ch = next(); if (builder!=null) builder.append((char)ch); while(isIdChar(ch)) { - ch=read(); + ch=next(); if (builder!=null) builder.append((char)ch); } if (ch!=-1) { @@ -570,11 +572,11 @@ public class HTMLStripCharFilter extends BaseCharFilter { // "> private int readAttr2() throws IOException { if ((numRead - lastMark < safeReadAheadLimit)) { - int ch = read(); + int ch = next(); if (!isFirstIdChar(ch)) return MISMATCH; - ch = read(); + ch = next(); while(isIdChar(ch) && ((numRead - lastMark) < safeReadAheadLimit)){ - ch=read(); + ch=next(); } if (isSpace(ch)) ch = nextSkipWS(); @@ -674,9 +676,11 @@ public class HTMLStripCharFilter extends BaseCharFilter { // where do we have to worry about them? // if (numWhitespace > 0){ - numWhitespace--; - return ' '; + numEaten += numWhitespace; + addOffCorrectMap(numReturned, numEaten); + numWhitespace = 0; } + numReturned++; //do not limit this one by the READAHEAD while(true) { int lastNumRead = numRead; diff --git a/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java b/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java index 7be7c7e83ce..0feb0b73e24 100644 --- a/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java +++ b/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java @@ -49,9 +49,9 @@ public class HTMLStripCharFilterTest extends TestCase { String html = "
this is some text
here is a
link and " + "another link. " + "This is an entity: & plus a <. Here is an &. "; - String gold = " this is some text here is a link and " + - "another link . " + - "This is an entity: & plus a < . Here is an &. "; + String gold = " this is some text here is a link and " + + "another link . " + + "This is an entity: & plus a <. Here is an &. "; HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html))); StringBuilder builder = new StringBuilder(); int ch = -1; @@ -87,7 +87,7 @@ public class HTMLStripCharFilterTest extends TestCase { public void testGamma() throws Exception { String test = "Γ"; - String gold = "\u0393 "; + String gold = "\u0393"; Set set = new HashSet(); set.add("reserved"); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set); @@ -103,8 +103,8 @@ public class HTMLStripCharFilterTest extends TestCase { } public void testEntities() throws Exception { - String test = "  <foo> = Γ bar Γ"; - String gold = " < foo> = \u0393 bar \u0393 "; + String test = "  <foo> Übermensch = Γ bar Γ"; + String gold = " \u00DCbermensch = \u0393 bar \u0393"; Set set = new HashSet(); set.add("reserved"); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set); @@ -121,7 +121,7 @@ public class HTMLStripCharFilterTest extends TestCase { public void testMoreEntities() throws Exception { String test = "  <junk/>   ! @ and ’"; - String gold = " < junk/> ! @ and ’ "; + String gold = " ! @ and ’"; Set set = new HashSet(); set.add("reserved"); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set); @@ -154,61 +154,19 @@ public class HTMLStripCharFilterTest extends TestCase { assertTrue("Other tag should be removed", result.indexOf("other") == -1); } - public void testStrip() throws Exception { - String test = "{{aaaaaaaa|aaaaaaaaa|aaa [[aaaaaa aaaaaa]] [[aaaaaaaaa]]|aaaaaaaaa (aaaaaa)}}\n" + - "{{aaaaaaaaa}}\n" + - "'''aaaaaaaaa''' aa a [[aaaaaaaaa aaaaaaaaaa]] aa aaaaa aa aaaaaaaaa aaa aaaaaaaaa aaaaaaaa aa aaaaaaaaa aa aaa aaaa aa aaaaaaaaaa " + - "[[aaaaaaaaaa]] ([[aa.]] \"[[aaaaa]]\"aaaa aaaaaaaaaaaaa aaaaaaaaaa aaaa aaaaa: \"a aaaaaaaaaaa aaaa aa aaaaaaaa aa aaa aaaaaaaaa " + - "aaaaa aa aaa aaaaaaaaaa aaaaaaa ''aaa aaaaaaaaaa'', aaaaaaaaa aa aaa aaaaa, aaa ''aaaaaaaaaa'', aaaaaaaaa aa aaa aaaaaaaaaaaaaa aa a aaaaaaaaa aaaaaa. aaaaaaaaaa, aaaa aaaaaaaa, " + - "aaaa aa aaa aaa aaaa aaaaaaaaaa aa a aaaaaaa aaa aaaaa, aaa aaaa aa aaaaaaaa aa aaaaaaaaa'a ''a aaaaaa'' aaaaaaaaaa aa aaa aaaaa aa aaa aaa aaaaaaa aa aaaaaaaaaa aa aaaa aaa aaa " + - "aaaa aa a aaaaaaaaa aaaaa aaaa aaaaaa aaa aaaaaaa aaaaaaaaa, aaa aa aaaaaaaaa, aaa aaaaa aa aaa aaaaaaaa. aaaaaaaaa aaaaaaa aaa aaaa aa aaaaaaa, aaaaaaaaaaa aaaaaaaaa aaaaaaaaa " + - "aaa aaaa aaaaaaaa aa aaa aaaaa.\" -aaaaaaa, aaaa. aaaaaaaaa," + - " aaaaaaaa aaaaa aaaa, a. aa-aa) aaa aaaaaaaaaa aaa aaaaaaaaaaa.''aaaaaaaaa''. aaaaaaaa¾aaa aaaaaaaaaa. aaaa. aaaaaaaa¾aaa aaaaaaaaaa aaaaaaa aaaaaaa. " + - "[[aa aaaaaa]] [[aaaa]] . aaaaaaaaa aa \"a aaaaaaa aa aaaaaaaaa aaa aaaaaaaaa aaaaaaa aa aaa aaaaaa aaaa aaaaaaaaaa aa aaaa aaaaaaa aaa " + - "aaaaaaaaaaa.\"''aaaaaaaaa''. aaa aaaaaaa aaaaaaaaa aaaaaaaaaaaa aa aaaaaaaaaa. aaaa. a. aa" + - " \"aaaaaaaaa aa aaa aaaa aaaa a aaaaaaa aaaaaaa aaa aaaaa, aa aaaaaaaaaa, aa aaaa aaaaaaaa aaa aaaaaaaaa.\" aaa aaaa \"aaaaaaaaa\" " + - "aa [[aaaaaaaaa|aaaaaaa aaaa]] aaa [[aaaaa aaaaaaaa|aaaaa]] ''[[aaaaaaaaaa:???????|???????]]'' (\"aaaaaaa [[aaaaaa]]a\" aa \"aaaaaaa aaaaaa\")." + - " aaaa \"aaaaaaaaa\", aa aaa aaaa aaaaaaa aaaaaaa, aa aaa aaaaaa aaaa aaa aaaaa aa [[aaaaaaaaa]] (aaa aaaa aaaa [[aaaaaaaaaaa aaaaaaaaa]]) aaa " + - "aaaaaaaaaaa aaa aaaaaa aa aaaaaaaaa. \n" + - "\n" + - "aaaaa aaa a aaaaaaa aa aaaaa aaa aaaaaaaaaa aa aaaaaaaaa aaaa aaaaaaa aaaaaa aa aaaaaaaaaa.aaaaaaaaaa, aaaa aaaaaaaaaaa. ''aaaaaaaaa: a " + - "aaaaaaaaaa aa aaaaaaaaaaaaa aaaaaaaa'', aaaaaaa aaaaa aaaaaaaaaaaa, aaaa, a.a{{aaaa aaaaaaa|aaaaaa=a.a. aaaaaa|aaaaa=aaa aaaaaaaaa " + - "aaaaaaaaa aa aaaaaaaaa aaaaaaa|aaaa=aaaa|aaaaaaa=aaaaaaa aaaaaaaaa aaaaaaaaa|aaaaaa=aa|aaaaa=a|aaaaa=aaa-aaa|aaa=aa.aaaa/aaaaaa}} aaaaaaa," + - " aaa aaaaaaaaa aaa aaa aaaaaaaaaaaa aaaa aaaaaaaaaaaaa aaa aaa aaa aa aaaa aaa aaaaaaaa aaaaaaaaa.aaaaaa, aaaaaaa. aaaaaaaaa. a aaaaaaaaa aa " + - "aaaaaaaaaaaa aaaaaaaaa aaaaaaaaaa, aaaaaaa aaaaaaa, aaaaaa a. aaa aaaaaa, aaaaaa. aaaaaaaaa aaaaaaaaaa, aaaa, a.aaa aaaaa aaaa aaa aaaaaaaaaaa" + - " aaaaa, \"aaaaa aa aa aaaaaa aaaaaaaa aaaaaaaa aaaa aaa aaaaaaaaaa aaaa, aaa aaaaa aaaaaaaaaa aaaaaaaaaa aa aaaa aaaaa a aaaaaaa [[aaaaaa aaaaaaaaaaa]].\"aaaaaaaaa. " + - "aaa aaaaaa aaaaaaaaa aa aaaaaaaaaa, aaaaaa aaaaaaaaaa aaaaa, aaaa, a. aa aaaaaaaaa aaaaaaaa aaaaaaa aaa aaa aa aaa aaaa aaaaa aa aaaaaaaaaaaa aaa aaaaaaaaaa.aaaaaaaa, " + - "aaaaaa aaaaaaa \"aaaaaaaaa aaaaaaaa aaa aaa aaaaaaa aaaaa aaaaaaaa aa aaaaa, aaaa-aaaa\" [a. aaa]\n" + - "==aaaaaaa==\n" + - "===aaa-aaaaaaaaaa aaaaaaa===\n" + - "{{aaaa|aaaaaaa aa aaaaaaaaa}}\n" + - "{{aaaaa aa aaaaaaaaaa}}\n" + - "aaaaaaaaaa aa aaa aaaaa aaa aaaaaaaaaaaa aaaaaaaaa aaa a aaaa aaaaaaa aaaaa aa aaa aaaaaaaaa aa aaa aaaaaaaaa aaaaaaaa aa aaaaaaaaaa aaaaaaa aaaaaa. aaaa aaaaa aaaa aaaaaaaaa aaaaaa " + - "aaa aa aaaaaaaa aa aaaaa aa aaa aa aaaaa aa aaa [[aaaaaa]] aaaa [[aaaaa|aaa aaa]],aaaaa aaaaaaaaa, [aaaa://aaaaaaaa.aaaaaa.aaa/aaaaaaaaaaaaaaaaaa/aaaaaaaaa/aaaaaaaaaaaaaaaa.aaaa " + - "\"aaaaaaaaa\", aaaa aaa aaaaaaaaaaaaa aaaaaaaaaa, aaaa] aaaaaa aaaa aa a aaaaaaaaaaaaa" + - " aaaaa.{{aaaaaaa|[aaaa://aaa.aaaaaaaaaaaaaaaaa.aaa/aaa/aaaaaa/aaaaa--aaaaaaaaa.aaa]|aa.a&aaaa;[[aaaaaaaa|aaa]]" + - "}} [[aaaa aa aaaaaa]], aaa aaaaaaa aa [[aaaaaaaa]] aaaa aaaaaaaaaa aaaaaa aaaaa aaaaaaa aaaaaaaaa aaaaaa.\n" + - "\n" + - "aaaaaaaaa aa aaa aaaaaa aaaaa, aaaaaaa, aaa aaa aaaaa aa aaa aaaaaaa aaaaaaaaa aaaaaaa aa aaa [[aaa aa aaaaaaaaaaaaa|aaaaaaaaaaaaa]], aaaaaaaaaaaa [[aaaaaaaa]]'a aaaaaaaaa aaa aaa aaaaa " + - "aaaaaaaaaa aa aaaaaaa.''aaaaaaaaa'', aaaaaaaaa¨ aaaaaaa¨ aaaaaa aaaaaaaaaaaa aaaa (aa aaaaaaa) aaaa://aa.aaaaaaa.aaa.aaa © aaaa-aaaa aaaaaaaaa aaaaaaaaaaa. aaa aaaaaa aaaaaaaa\n" + - " aaa aaaa \"aaaaaaaaa\" aaa aaaaaaaaaa aaaa aa a aaaa aa [[aaaaa]], aaa aa aaa [[aaaaaa aaaaaaaaaa]] aaaa aaaaaa aaaa aa aaa ''aaaaaŽa'' aaa aaaaaaa aa aaa aaa aaaa aa a aaaaaaaa " + - "aaaaa,aaaaaaa, aaaa. ''aaaaaaaaa'', aaaaaa: aaaaaaaa aaaaa aaa., aaaa. aa. aa aaaaaa aaa [[aaaaaaa (aaaaaaaa)|aaaaaaa]] aaaaaaa aa a \"aaaaaaaaaaaaa aaaaaaaaaa\" aa aa [[aaaaaaaa]]. " + - "aa aaa aa aaaa aaaaaaaaa aaaaaaa aaaa [[aaaaaaa aaaaaa]] aaaaa aaaaaaa aaa aaaaaaaaaa, aaaaa aa aaaaaaaaaa aa aaaa aa aa aaa aaaaa aaaaaaaaaa aa aaaaaa aaaaaaaaa aaaaaaa." + - "[aaaa://aaaaa.aaaaaaaa.aaa/aaaaaaa/aaaaaa/ aaaaaaa aaaaaa] aaaaaaaa aaaaaaaaaaaa aa aaaaaaaaaa, aaaaa aaaaaaaaa aaa [[aa aaaaaaa]] [[aaaa]]; aaaaaaaaaaa aaaaaaaa aaa [[aaa aa]] [[aaaa]]\n" + - "\n" + - "[[aaaaa aaaaaaaaaa]] aa ''[[aaa aaaaaaaaaaaa aa aaaaaa]]'' (aaaa) aaaa aaa aaaa [[zzzzzzz]] aa aaaaaaaa"; + public void testMalformedHTML() throws Exception { + String test = "a > "; + String gold = "a ::: String: " + builder.toString(), ch == ch2 || ch == ' '/*&& ch != '<' && ch != '>'*/); + int ch = 0; + while ((ch = reader.read()) != -1){ builder.append((char)ch); - i++; } + String result = builder.toString(); + System.out.println("Resu: " + result + ""); + System.out.println("Gold: " + gold + ""); + assertTrue(result + " is not equal to " + gold + "", result.equals(gold) == true); } public void testBufferOverflow() throws Exception { @@ -264,7 +222,7 @@ public class HTMLStripCharFilterTest extends TestCase { public void testComment() throws Exception { String test = " "; - String gold = " "; + String gold = " "; Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader int ch = 0; StringBuilder builder = new StringBuilder();