SOLR-42: HTMLStripReader replaces removed content with spaces to preserve offsets

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@609162 13f79535-47bb-0310-9956-ffa450edef68
2008-01-05 15:59:47 +00:00 · 2008-01-05 15:59:47 +00:00 · db84ea5964
parent 71dc04fcef
commit db84ea5964
3 changed files with 399 additions and 7 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -240,6 +240,9 @@ Bug Fixes
 15. SOLR-449: the python and ruby response writers are now able to correctly 
    output NaN and Infinity in their respective languages.  (klaas)

+16. SOLR-42: HTMLStripReader tokenizers now preserve correct source
+    offsets for highlighting.  (Grant Ingersoll via yonik)
+
 Other Changes
 1. SOLR-135: Moved common classes to org.apache.solr.common and altered the
    build scripts to make two jars: apache-solr-1.3.jar and 
--- a/src/java/org/apache/solr/analysis/HTMLStripReader.java
+++ b/src/java/org/apache/solr/analysis/HTMLStripReader.java
@ -23,6 +23,8 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.HashMap;
+import java.util.Set;
+import java.util.Collections;

 /**
 * A Reader that wraps another reader and attempts to strip out HTML constructs.
@ -34,6 +36,9 @@ import java.util.HashMap;
 public class HTMLStripReader extends Reader {
  private final Reader in;
  private final int READAHEAD=4096;
+  private int numWhitespace = 0;
+  private int numRead = 0;
+  private Set<String> escapedTags = Collections.emptySet();

  // pushback buffer
  private final StringBuilder pushed = new StringBuilder();
@ -58,6 +63,11 @@ public class HTMLStripReader extends Reader {
    this.in=source.markSupported() ? source : new BufferedReader(source);
  }

+  public HTMLStripReader(Reader source, Set<String> escapedTags){
+    this(source);
+    this.escapedTags = escapedTags;
+  }
+

  private int next() throws IOException {
    int len = pushed.length();
@ -66,6 +76,7 @@ public class HTMLStripReader extends Reader {
      pushed.setLength(len-1);
      return ch;
    }
+    numRead++;
    return in.read();
  }

@ -364,7 +375,10 @@ public class HTMLStripReader extends Reader {
        break;
      }
    }
-
+    if (escapedTags.contains(sb.toString())){
+      //if this is one of the escapedTags, keep it: MISMATCH means the tag is not stripped
+      return MISMATCH;
+    }
    // After the tag id, there needs to be either whitespace or
    // '>'
    if ( !(ch=='>' || isSpace(ch)) ) {
@ -445,7 +459,7 @@ public class HTMLStripReader extends Reader {
          push(ch);
          continue;
        }
-        int ret = readName();
+        int ret = readName(false);
        if (ret==MISMATCH) return MISMATCH;
        ch=nextSkipWS();
        if (ch!='>') return MISMATCH;
@ -482,12 +496,25 @@ public class HTMLStripReader extends Reader {
  }


-  private int readName() throws IOException {
+  private int readName(boolean checkEscaped) throws IOException {
+    StringBuilder builder = new StringBuilder();
    int ch = read();
+    builder.append((char)ch);
    if (!isFirstIdChar(ch)) return MISMATCH;
    ch = read();
-    while(isIdChar(ch)) ch=read();
-    if (ch!=-1) push(ch);
+    builder.append((char)ch);
+    while(isIdChar(ch)) {
+      ch=read();
+      builder.append((char)ch);
+    }
+    if (ch!=-1) {
+      push(ch);
+
+    }
+    //strip off the last appended char: the non-id char that ended the name (e.g. '>')
+    if (checkEscaped && escapedTags.contains(builder.substring(0, builder.length() - 1))){
+      return MISMATCH;
+    }
    return MATCH;
  }

@ -645,12 +672,18 @@ public class HTMLStripReader extends Reader {
  }


+
  public int read() throws IOException {
    // TODO: Do we ever want to preserve CDATA sections?
    // where do we have to worry about them?
    // <![ CDATA [ unescaped markup ]]>
+    if (numWhitespace > 0){
+      numWhitespace--;
+      return ' ';
+    }

    while(true) {
+      int lastNumRead = numRead;
      int ch = next();

      switch (ch) {
@ -660,6 +693,7 @@ public class HTMLStripReader extends Reader {
          if (ch>=0) return ch;
          if (ch==MISMATCH) {
            restoreState();
+
            return '&';
          }
          break;
@ -671,7 +705,7 @@ public class HTMLStripReader extends Reader {
          if (ch=='!') {
            ret = readBang(false);
          } else if (ch=='/') {
-            ret = readName();
+            ret = readName(true);
            if (ret==MATCH) {
              ch=nextSkipWS();
              ret= ch=='>' ? MATCH : MISMATCH;
@ -685,7 +719,12 @@ public class HTMLStripReader extends Reader {

          // matched something to be discarded, so break
          // from this case and continue in the loop
-          if (ret==MATCH) break;
+          if (ret==MATCH) {
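+            //previously this was just a break, which discarded the matched construct;
+            //now return one space per character read so that source offsets are preserved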
+            numWhitespace = (numRead - lastNumRead) - 1;//tack on the -1 since we are returning a space right now
+            return ' ';
+          }

          // didn't match any HTML constructs, so roll back
          // the stream state and just return '<'
--- a/src/test/test-files/htmlStripReaderTest.html
+++ b/src/test/test-files/htmlStripReaderTest.html
@ -0,0 +1,350 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.8">
+<meta name="Forrest-skin-name" content="pelt">
+<title>Welcome to Solr</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">apache</a> &gt; <a href="http://lucene.apache.org/">lucene</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://lucene.apache.org/solr/"><img class="logoImage" alt="Solr" src="images/solr.png" title="Solr Description"></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
+<input value="lucene.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li class="current">
+<a class="selected" href="index.html">Main</a>
+</li>
+<li>
+<a class="unselected" href="http://wiki.apache.org/solr">Wiki</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Endtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">About</div>
+<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
+<div class="menupage">
+<div class="menupagetitle">Welcome</div>
+</div>
+<div class="menuitem">
+<a href="who.html" title="Solr Committers">Who We Are</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="features.html">Features</a>
+</div>
+<div class="menuitem">
+<a href="tutorial.html">Tutorial</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/solr/">Docs (Wiki)</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/solr/FAQ">FAQ</a>
+</div>
+<div class="menuitem">
+<a href="api/index.html">javadoc</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://www.apache.org/dyn/closer.cgi/lucene/solr/">Download</a>
+</div>
+<div class="menuitem">
+<a href="mailing_lists.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="issue_tracking.html">Issue Tracking</a>
+</div>
+<div class="menuitem">
+<a href="version_control.html">Version Control</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
+<div id="menu_1.4" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://lucene.apache.org/java/">Lucene Java</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/nutch/">Nutch</a>
+</div>
+</div>
+<div id="credit">
+<hr>
+<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache Forrest" alt="Built with Apache Forrest - logo" src="images/built-with-forrest-button.png" style="width: 88px;height: 31px;"></a>
+</div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="index.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>Welcome to Solr</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#intro">What Is Solr?</a>
+</li>
+<li>
+<a href="#news">News</a>
+<ul class="minitoc">
+<li>
+<a href="#02+October+2007+-+Solr+at+OSSummit+Asia">02 October 2007 - Solr at OSSummit Asia</a>
+</li>
+<li>
+<a href="#03+September+2007+-+Lucene+at+ApacheCon+Atlanta">03 September 2007 - Lucene at ApacheCon Atlanta</a>
+</li>
+<li>
+<a href="#06+June+2007%3A+Release+1.2+available">06 June 2007: Release 1.2 available</a>
+</li>
+<li>
+<a href="#17+January+2007%3A+Solr+graduates+from+Incubator">17 January 2007: Solr graduates from Incubator</a>
+</li>
+<li>
+<a href="#22+December+2006%3A+Release+1.1.0+available">22 December 2006: Release 1.1.0 available</a>
+</li>
+<li>
+<a href="#15+August+2006%3A+Solr+at+ApacheCon+US">15 August 2006: Solr at ApacheCon US</a>
+</li>
+<li>
+<a href="#21+April+2006%3A+Solr+at+ApacheCon">21 April 2006: Solr at ApacheCon</a>
+</li>
+<li>
+<a href="#21+February+2006%3A+nightly+builds">21 February 2006: nightly builds</a>
+</li>
+<li>
+<a href="#17+January+2006%3A+Solr+Joins+Apache+Incubator">17 January 2006: Solr Joins Apache Incubator</a>
+</li>
+</ul>
+</li>
+</ul>
+</div> 
+    
+<a name="N1000D"></a><a name="intro"></a>
+<h2 class="boxed">What Is Solr?</h2>
+<div class="section">
+<p>
+        Solr is an open source enterprise search server based on the
+        <a href="http://lucene.apache.org/java/">Lucene Java</a> search library, with XML/HTTP and JSON APIs,
+        hit highlighting, faceted search, caching, replication, and a web administration interface.
+        It runs in a Java servlet container such as <a href="http://tomcat.apache.org">Tomcat</a>.
+      </p>
+<p>
+        See the complete <a href="features.html">feature list</a> for more details, then check out the <a href="tutorial.html">tutorial</a>.
+      </p>
+</div>
+
+    
+<a name="N1002A"></a><a name="news"></a>
+<h2 class="boxed">News</h2>
+<div class="section">
+<a name="N10030"></a><a name="02+October+2007+-+Solr+at+OSSummit+Asia"></a>
+<h3 class="boxed">02 October 2007 - Solr at OSSummit Asia</h3>
+<p>
+<a href="http://www.ossummit.com"><img alt="OSSummit Asia logo" class="float-right" src="http://www.ossummit.com/2007/images/logo.png"></a>
+          Lucene and Solr tutorials!
+        </p>
+<p>The following talks and trainings are scheduled for the upcoming 2008 OSSummit:</p>
+<ul>
+            
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/8">Lucene Boot Camp</a> by Erik Hatcher (originally by Grant Ingersoll).  An all-day training focusing on getting started with Lucene - the core library under Solr.</li>
+            
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/25">Solr in a Day</a> by Erik Hatcher.  All you need to know to use Solr effectively.</li>
+            
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/67">Lucene Case Studies</a> by Erik Hatcher.  A rapid series of examples of many Lucene and Solr using applications.</li>
+          
+</ul>
+<a name="N10058"></a><a name="03+September+2007+-+Lucene+at+ApacheCon+Atlanta"></a>
+<h3 class="boxed">03 September 2007 - Lucene at ApacheCon Atlanta</h3>
+<p>
+<a href="http://www.us.apachecon.com"><img alt="ApacheCon US logo" class="float-right" src="http://www.apache.org/ads/ApacheCon/2007-usa-125x125.png"></a>
+              Lucene will once again be well represented at ApacheCon USA in Atlanta this November 12-16, 2007.  
+            </p>
+<p>The following talks and trainings are scheduled for this year's conference:</p>
+<ul>
+                
+<li>November 12: <a href="http://us.apachecon.com/us2007/program/talk/1859">Lucene Boot Camp</a> by Grant Ingersoll.  An all-day training focusing on getting started with Lucene.</li>
+                
+<li>November 16, 9:00 am: <a href="http://us.apachecon.com/us2007/program/talk/1992">Apache Solr out of the Box</a> by Chris Hostetter. Introduction to Solr.</li>
+                
+<li>November 16, 10:00 am: <a href="http://us.apachecon.com/us2007/program/talk/1943">Building a Vertical Search Site using Apache Software</a> by Ken Krugler. Will cover many Lucene-based projects.</li>
+                
+<li>November 16, 3:00 pm: <a href="http://us.apachecon.com/us2007/program/talk/1953">Apache Lucene Performance</a> by Grant Ingersoll. Tips and techniques for improving Lucene performance.</li>
+                
+<li>November 16, 4:00 pm: <a href="http://us.apachecon.com/us2007/program/talk/2017"> Advanced Indexing Techniques with Apache Lucene</a> by Michael Busch. Information on payloads and advanced indexing techniques.</li>
+              
+</ul>
+<a name="N10091"></a><a name="06+June+2007%3A+Release+1.2+available"></a>
+<h3 class="boxed">06 June 2007: Release 1.2 available</h3>
+<p>
+        This is the first release since Solr graduated from the Incubator,
+        bringing many new features, including CSV/delimited-text data
+        loading, time based autocommit, faster faceting, negative filters,
+        a spell-check handler, sounds-like word filters, regex text filters,
+        and more flexible plugins.
+      </p>
+<p>See the <a href="http://svn.apache.org/repos/asf/lucene/solr/tags/release-1.2.0/CHANGES.txt">release notes</a> for more details.</p>
+<a name="N100A2"></a><a name="17+January+2007%3A+Solr+graduates+from+Incubator"></a>
+<h3 class="boxed">17 January 2007: Solr graduates from Incubator</h3>
+<p>
+        Solr has graduated from the Apache Incubator, and is now a sub-project of Lucene.
+      </p>
+<a name="N100AC"></a><a name="22+December+2006%3A+Release+1.1.0+available"></a>
+<h3 class="boxed">22 December 2006: Release 1.1.0 available</h3>
+<p>
+        This is the first release since Solr joined the Incubator, and brings
+        many new features and performance optimizations including highlighting,
+        faceted search, and JSON/Python/Ruby response formats.
+      </p>
+<a name="N100B6"></a><a name="15+August+2006%3A+Solr+at+ApacheCon+US"></a>
+<h3 class="boxed">15 August 2006: Solr at ApacheCon US</h3>
+<p>Chris Hostetter will be presenting
+        <strong><a href="http://www.apachecon.com/2006/US/html/sessions.html#FR26">"Faceted Searching With Apache Solr"</a></strong>  
+        at ApacheCon US 2006, on October 13th at 4:30pm.
+        See the <a href="http://www.us.apachecon.com/">ApacheCon</a> website for more details.
+      </p>
+<a name="N100C9"></a><a name="21+April+2006%3A+Solr+at+ApacheCon"></a>
+<h3 class="boxed">21 April 2006: Solr at ApacheCon</h3>
+<p>Yonik Seeley will be presenting
+        <strong>"Apache Solr, a Full-Text Search Server based on Lucene"</strong>  
+        at ApacheCon Europe 2006, on June 29th at 5:30pm.
+        See the <a href="http://www.eu.apachecon.com/">ApacheCon</a> website for more details.
+      </p>
+<a name="N100DA"></a><a name="21+February+2006%3A+nightly+builds"></a>
+<h3 class="boxed">21 February 2006: nightly builds</h3>
+<p>Solr now has nightly builds.  This automatically creates a
+      <a href="http://people.apache.org/builds/lucene/solr/nightly/">downloadable version of Solr every
+      night</a>.  All unit tests must pass, or a message is sent to
+      the developers mailing list and no new version is created.  This
+      also updates the <a href="api/index.html">javadoc</a>.</p>
+<a name="N100EC"></a><a name="17+January+2006%3A+Solr+Joins+Apache+Incubator"></a>
+<h3 class="boxed">17 January 2006: Solr Joins Apache Incubator</h3>
+<p>Solr, a search server based on Lucene, has been accepted into the Apache Incubator.
+            Solr was originally developed by CNET Networks, and is widely used within CNET
+            to provide high relevancy search and faceted browsing capabilities.
+            </p>
+</div>
+
+  
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2007 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<div id="logos"></div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>