diff --git a/CHANGES.txt b/CHANGES.txt index 18f319b10b3..b26a40f9422 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -240,6 +240,9 @@ Bug Fixes 15. SOLR-449: the python and ruby response writers are now able to correctly output NaN and Infinity in their respective languages. (klaas) +16. SOLR-42: HTMLStripReader tokenizers now preserve correct source + offsets for highlighting. (Grant Ingersoll via yonik) + Other Changes 1. SOLR-135: Moved common classes to org.apache.solr.common and altered the build scripts to make two jars: apache-solr-1.3.jar and diff --git a/src/java/org/apache/solr/analysis/HTMLStripReader.java b/src/java/org/apache/solr/analysis/HTMLStripReader.java index f7db32f02b5..a4c599d7ef4 100644 --- a/src/java/org/apache/solr/analysis/HTMLStripReader.java +++ b/src/java/org/apache/solr/analysis/HTMLStripReader.java @@ -23,6 +23,8 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.HashMap; +import java.util.Set; +import java.util.Collections; /** * A Reader that wraps another reader and attempts to strip out HTML constructs. @@ -34,6 +36,9 @@ import java.util.HashMap; public class HTMLStripReader extends Reader { private final Reader in; private final int READAHEAD=4096; + private int numWhitespace = 0; + private int numRead = 0; + private Set escapedTags = Collections.emptySet(); // pushback buffer private final StringBuilder pushed = new StringBuilder(); @@ -58,6 +63,11 @@ public class HTMLStripReader extends Reader { this.in=source.markSupported() ? source : new BufferedReader(source); } + public HTMLStripReader(Reader source, Set escapedTags){ + this(source); + this.escapedTags = escapedTags; + } + private int next() throws IOException { int len = pushed.length(); @@ -66,6 +76,7 @@ public class HTMLStripReader extends Reader { pushed.setLength(len-1); return ch; } + numRead++; return in.read(); } @@ -364,7 +375,10 @@ public class HTMLStripReader extends Reader { break; } } - + if (escapedTags.contains(sb.toString())){ + //if this is a reservedTag, then keep it + return MISMATCH; + } // After the tag id, there needs to be either whitespace or // '>' if ( !(ch=='>' || isSpace(ch)) ) { @@ -445,7 +459,7 @@ public class HTMLStripReader extends Reader { push(ch); continue; } - int ret = readName(); + int ret = readName(false); if (ret==MISMATCH) return MISMATCH; ch=nextSkipWS(); if (ch!='>') return MISMATCH; @@ -482,12 +496,25 @@ public class HTMLStripReader extends Reader { } - private int readName() throws IOException { + private int readName(boolean checkEscaped) throws IOException { + StringBuilder builder = new StringBuilder(); int ch = read(); + builder.append((char)ch); if (!isFirstIdChar(ch)) return MISMATCH; ch = read(); - while(isIdChar(ch)) ch=read(); - if (ch!=-1) push(ch); + builder.append((char)ch); + while(isIdChar(ch)) { + ch=read(); + builder.append((char)ch); + } + if (ch!=-1) { + push(ch); + + } + //strip off the trailing > + if (checkEscaped && escapedTags.contains(builder.substring(0, builder.length() - 1))){ + return MISMATCH; + } return MATCH; } @@ -645,12 +672,18 @@ public class HTMLStripReader extends Reader { } + public int read() throws IOException { // TODO: Do we ever want to preserve CDATA sections? // where do we have to worry about them? // + if (numWhitespace > 0){ + numWhitespace--; + return ' '; + } while(true) { + int lastNumRead = numRead; int ch = next(); switch (ch) { @@ -660,6 +693,7 @@ public class HTMLStripReader extends Reader { if (ch>=0) return ch; if (ch==MISMATCH) { restoreState(); + return '&'; } break; @@ -671,7 +705,7 @@ public class HTMLStripReader extends Reader { if (ch=='!') { ret = readBang(false); } else if (ch=='/') { - ret = readName(); + ret = readName(true); if (ret==MATCH) { ch=nextSkipWS(); ret= ch=='>' ? MATCH : MISMATCH; @@ -685,7 +719,12 @@ public class HTMLStripReader extends Reader { // matched something to be discarded, so break // from this case and continue in the loop - if (ret==MATCH) break; + if (ret==MATCH) { + //break;//was + //return whitespace from + numWhitespace = (numRead - lastNumRead) - 1;//tack on the -1 since we are returning a space right now + return ' '; + } // didn't match any HTML constructs, so roll back // the stream state and just return '<' diff --git a/src/test/test-files/htmlStripReaderTest.html b/src/test/test-files/htmlStripReaderTest.html new file mode 100755 index 00000000000..04e6cc1b39e --- /dev/null +++ b/src/test/test-files/htmlStripReaderTest.html @@ -0,0 +1,350 @@ + + + + + + + +Welcome to Solr + + + + + + + + + +
+ + + +
+ + + + + + + + + + + + +
+
+
+
+ +
+ + +
+ +
+ +   +
+ + + + + +
+ +

Welcome to Solr

+ + + +

What Is Solr?

+
+

+ Solr is an open source enterprise search server based on the + Lucene Java search library, with XML/HTTP and JSON APIs, + hit highlighting, faceted search, caching, replication, and a web administration interface. + It runs in a Java servlet container such as Tomcat. +

+

+ See the complete feature list for more details, then check out the tutorial. +

+
+ + + +

News

+
+ +

02 October 2007 - Solr at OSSummit Asia

+

+OSSummit Asia logo + Lucene and Solr tutorials! +

+

The following talks and trainings are scheduled for the upcoming 2008 OSSummit:

+
    + +
  • +Lucene Boot Camp by Erik Hatcher (originally by Grant Ingersoll). An all-day training focusing on getting started with Lucene - the core library under Solr.
  • + +
  • +Solr in a Day by Erik Hatcher. All you need to know to use Solr effectively.
  • + +
  • +Lucene Case Studies by Erik Hatcher. A rapid series of examples of many Lucene and Solr using applications.
  • + +
+ +

03 September 2007 - Lucene at ApacheCon Atlanta

+

+ApacheCon US logo + Lucene will once again be well represented at ApacheCon USA in Atlanta this November 12-16, 2007. +

+

The following talks and trainings are scheduled for this year's conference:

+ + +

06 June 2007: Release 1.2 available

+

+ This is the first release since Solr graduated from the Incubator, + bringing many new features, including CSV/delimited-text data + loading, time based autocommit, faster faceting, negative filters, + a spell-check handler, sounds-like word filters, regex text filters, + and more flexible plugins. +

+

See the release notes for more details.

+ +

17 January 2007: Solr graduates from Incubator

+

+ Solr has graduated from the Apache Incubator, and is now a sub-project of Lucene. +

+ +

22 December 2006: Release 1.1.0 available

+

+ This is the first release since Solr joined the Incubator, and brings + many new features and performance optimizations including highlighting, + faceted search, and JSON/Python/Ruby response formats. +

+ +

15 August 2006: Solr at ApacheCon US

+

Chris Hostetter will be presenting + "Faceted Searching With Apache Solr" + at ApacheCon US 2006, on October 13th at 4:30pm. + See the ApacheCon website for more details. +

+ +

21 April 2006: Solr at ApacheCon

+

Yonik Seeley will be presenting + "Apache Solr, a Full-Text Search Server based on Lucene" + at ApacheCon Europe 2006, on June 29th at 5:30pm. + See the ApacheCon website for more details. +

+ +

21 February 2006: nightly builds

+

Solr now has nightly builds. This automatically creates a + downloadable version of Solr every + night. All unit tests must pass, or a message is sent to + the developers mailing list and no new version is created. This + also updates the javadoc.

+ +

17 January 2006: Solr Joins Apache Incubator

+

Solr, a search server based on Lucene, has been accepted into the Apache Incubator. + Solr was originally developed by CNET Networks, and is widely used within CNET + to provide high relevancy search and faceted browsing capabilities. +

+
+ + +
+ +
 
+
+ + +