SOLR-42: HTMLStripReader replaces removed content with spaces to preserve offsets

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@609162 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2008-01-05 15:59:47 +00:00
parent 71dc04fcef
commit db84ea5964
3 changed files with 399 additions and 7 deletions

View File

@ -240,6 +240,9 @@ Bug Fixes
15. SOLR-449: the python and ruby response writers are now able to correctly
output NaN and Infinity in their respective languages. (klaas)
16. SOLR-42: HTMLStripReader tokenizers now preserve correct source
offsets for highlighting. (Grant Ingersoll via yonik)
Other Changes
1. SOLR-135: Moved common classes to org.apache.solr.common and altered the
build scripts to make two jars: apache-solr-1.3.jar and

View File

@ -23,6 +23,8 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Set;
import java.util.Collections;
/**
* A Reader that wraps another reader and attempts to strip out HTML constructs.
@ -34,6 +36,9 @@ import java.util.HashMap;
public class HTMLStripReader extends Reader {
private final Reader in;
private final int READAHEAD=4096;
private int numWhitespace = 0;
private int numRead = 0;
private Set<String> escapedTags = Collections.emptySet();
// pushback buffer
private final StringBuilder pushed = new StringBuilder();
@ -58,6 +63,11 @@ public class HTMLStripReader extends Reader {
this.in=source.markSupported() ? source : new BufferedReader(source);
}
/**
 * Creates a reader that strips HTML constructs from {@code source}, except
 * for tags whose names are listed in {@code escapedTags}; the tag-matching
 * code reports those as a mismatch so that they are kept in the output.
 *
 * @param source the underlying reader to read the HTML from
 * @param escapedTags names of tags that must not be stripped; {@code null}
 *        is treated the same as an empty set
 */
public HTMLStripReader(Reader source, Set<String> escapedTags){
  this(source);
  // The tag matchers call escapedTags.contains(...) unconditionally, so a
  // null argument must fall back to the empty set instead of being stored.
  this.escapedTags = escapedTags == null ? Collections.<String>emptySet() : escapedTags;
}
private int next() throws IOException {
int len = pushed.length();
@ -66,6 +76,7 @@ public class HTMLStripReader extends Reader {
pushed.setLength(len-1);
return ch;
}
numRead++;
return in.read();
}
@ -364,7 +375,10 @@ public class HTMLStripReader extends Reader {
break;
}
}
if (escapedTags.contains(sb.toString())){
//if this is an escaped tag (listed in escapedTags), then keep it
return MISMATCH;
}
// After the tag id, there needs to be either whitespace or
// '>'
if ( !(ch=='>' || isSpace(ch)) ) {
@ -445,7 +459,7 @@ public class HTMLStripReader extends Reader {
push(ch);
continue;
}
int ret = readName();
int ret = readName(false);
if (ret==MISMATCH) return MISMATCH;
ch=nextSkipWS();
if (ch!='>') return MISMATCH;
@ -482,12 +496,25 @@ public class HTMLStripReader extends Reader {
}
private int readName() throws IOException {
private int readName(boolean checkEscaped) throws IOException {
StringBuilder builder = new StringBuilder();
int ch = read();
builder.append((char)ch);
if (!isFirstIdChar(ch)) return MISMATCH;
ch = read();
while(isIdChar(ch)) ch=read();
if (ch!=-1) push(ch);
builder.append((char)ch);
while(isIdChar(ch)) {
ch=read();
builder.append((char)ch);
}
if (ch!=-1) {
push(ch);
}
//strip off the trailing >
if (checkEscaped && escapedTags.contains(builder.substring(0, builder.length() - 1))){
return MISMATCH;
}
return MATCH;
}
@ -645,12 +672,18 @@ public class HTMLStripReader extends Reader {
}
public int read() throws IOException {
// TODO: Do we ever want to preserve CDATA sections?
// where do we have to worry about them?
// <![ CDATA [ unescaped markup ]]>
if (numWhitespace > 0){
numWhitespace--;
return ' ';
}
while(true) {
int lastNumRead = numRead;
int ch = next();
switch (ch) {
@ -660,6 +693,7 @@ public class HTMLStripReader extends Reader {
if (ch>=0) return ch;
if (ch==MISMATCH) {
restoreState();
return '&';
}
break;
@ -671,7 +705,7 @@ public class HTMLStripReader extends Reader {
if (ch=='!') {
ret = readBang(false);
} else if (ch=='/') {
ret = readName();
ret = readName(true);
if (ret==MATCH) {
ch=nextSkipWS();
ret= ch=='>' ? MATCH : MISMATCH;
@ -685,7 +719,12 @@ public class HTMLStripReader extends Reader {
// matched something to be discarded, so break
// from this case and continue in the loop
if (ret==MATCH) break;
if (ret==MATCH) {
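// previously this was just 'break', which silently discarded the matched construct;
// now return whitespace in its place so that source offsets are preserved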
numWhitespace = (numRead - lastNumRead) - 1;//tack on the -1 since we are returning a space right now
return ' ';
}
// didn't match any HTML constructs, so roll back
// the stream state and just return '<'

View File

@ -0,0 +1,350 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta content="Apache Forrest" name="Generator">
<meta name="Forrest-version" content="0.8">
<meta name="Forrest-skin-name" content="pelt">
<title>Welcome to Solr</title>
<link type="text/css" href="skin/basic.css" rel="stylesheet">
<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
<link type="text/css" href="skin/profile.css" rel="stylesheet">
<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
<link rel="shortcut icon" href="images/favicon.ico">
</head>
<body onload="init()">
<script type="text/javascript">ndeSetTextSize();</script>
<div id="top">
<!--+
|breadtrail
+-->
<div class="breadtrail">
<a href="http://www.apache.org/">apache</a> &gt; <a href="http://lucene.apache.org/">lucene</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
+-->
<div class="header">
<!--+
|start group logo
+-->
<div class="grouplogo">
<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
+-->
<!--+
|start Project Logo
+-->
<div class="projectlogo">
<a href="http://lucene.apache.org/solr/"><img class="logoImage" alt="Solr" src="images/solr.png" title="Solr Description"></a>
</div>
<!--+
|end Project Logo
+-->
<!--+
|start Search
+-->
<div class="searchbox">
<form action="http://www.google.com/search" method="get" class="roundtopsmall">
<input value="lucene.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp;
<input name="Search" value="Search" type="submit">
</form>
</div>
<!--+
|end search
+-->
<!--+
|start Tabs
+-->
<ul id="tabs">
<li class="current">
<a class="selected" href="index.html">Main</a>
</li>
<li>
<a class="unselected" href="http://wiki.apache.org/solr">Wiki</a>
</li>
</ul>
<!--+
|end Tabs
+-->
</div>
</div>
<div id="main">
<div id="publishedStrip">
<!--+
|start Subtabs
+-->
<div id="level2tabs"></div>
<!--+
|end Endtabs
+-->
<script type="text/javascript"><!--
document.write("Last Published: " + document.lastModified);
// --></script>
</div>
<!--+
|breadtrail
+-->
<div class="breadtrail">
&nbsp;
</div>
<!--+
|start Menu, mainarea
+-->
<!--+
|start Menu
+-->
<div id="menu">
<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">About</div>
<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
<div class="menupage">
<div class="menupagetitle">Welcome</div>
</div>
<div class="menuitem">
<a href="who.html" title="Solr Committers">Who We Are</a>
</div>
</div>
<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
<div id="menu_1.2" class="menuitemgroup">
<div class="menuitem">
<a href="features.html">Features</a>
</div>
<div class="menuitem">
<a href="tutorial.html">Tutorial</a>
</div>
<div class="menuitem">
<a href="http://wiki.apache.org/solr/">Docs (Wiki)</a>
</div>
<div class="menuitem">
<a href="http://wiki.apache.org/solr/FAQ">FAQ</a>
</div>
<div class="menuitem">
<a href="api/index.html">javadoc</a>
</div>
</div>
<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div>
<div id="menu_1.3" class="menuitemgroup">
<div class="menuitem">
<a href="http://www.apache.org/dyn/closer.cgi/lucene/solr/">Download</a>
</div>
<div class="menuitem">
<a href="mailing_lists.html">Mailing Lists</a>
</div>
<div class="menuitem">
<a href="issue_tracking.html">Issue Tracking</a>
</div>
<div class="menuitem">
<a href="version_control.html">Version Control</a>
</div>
</div>
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
<a href="http://lucene.apache.org/java/">Lucene Java</a>
</div>
<div class="menuitem">
<a href="http://lucene.apache.org/nutch/">Nutch</a>
</div>
</div>
<div id="credit">
<hr>
<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache Forrest" alt="Built with Apache Forrest - logo" src="images/built-with-forrest-button.png" style="width: 88px;height: 31px;"></a>
</div>
<div id="roundbottom">
<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
<!--+
|alternative credits
+-->
<div id="credit2"></div>
</div>
<!--+
|end Menu
+-->
<!--+
|start content
+-->
<div id="content">
<div title="Portable Document Format" class="pdflink">
<a class="dida" href="index.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
PDF</a>
</div>
<h1>Welcome to Solr</h1>
<div id="minitoc-area">
<ul class="minitoc">
<li>
<a href="#intro">What Is Solr?</a>
</li>
<li>
<a href="#news">News</a>
<ul class="minitoc">
<li>
<a href="#02+October+2007+-+Solr+at+OSSummit+Asia">02 October 2007 - Solr at OSSummit Asia</a>
</li>
<li>
<a href="#03+September+2007+-+Lucene+at+ApacheCon+Atlanta">03 September 2007 - Lucene at ApacheCon Atlanta</a>
</li>
<li>
<a href="#06+June+2007%3A+Release+1.2+available">06 June 2007: Release 1.2 available</a>
</li>
<li>
<a href="#17+January+2007%3A+Solr+graduates+from+Incubator">17 January 2007: Solr graduates from Incubator</a>
</li>
<li>
<a href="#22+December+2006%3A+Release+1.1.0+available">22 December 2006: Release 1.1.0 available</a>
</li>
<li>
<a href="#15+August+2006%3A+Solr+at+ApacheCon+US">15 August 2006: Solr at ApacheCon US</a>
</li>
<li>
<a href="#21+April+2006%3A+Solr+at+ApacheCon">21 April 2006: Solr at ApacheCon</a>
</li>
<li>
<a href="#21+February+2006%3A+nightly+builds">21 February 2006: nightly builds</a>
</li>
<li>
<a href="#17+January+2006%3A+Solr+Joins+Apache+Incubator">17 January 2006: Solr Joins Apache Incubator</a>
</li>
</ul>
</li>
</ul>
</div>
<a name="N1000D"></a><a name="intro"></a>
<h2 class="boxed">What Is Solr?</h2>
<div class="section">
<p>
Solr is an open source enterprise search server based on the
<a href="http://lucene.apache.org/java/">Lucene Java</a> search library, with XML/HTTP and JSON APIs,
hit highlighting, faceted search, caching, replication, and a web administration interface.
It runs in a Java servlet container such as <a href="http://tomcat.apache.org">Tomcat</a>.
</p>
<p>
See the complete <a href="features.html">feature list</a> for more details, then check out the <a href="tutorial.html">tutorial</a>.
</p>
</div>
<a name="N1002A"></a><a name="news"></a>
<h2 class="boxed">News</h2>
<div class="section">
<a name="N10030"></a><a name="02+October+2007+-+Solr+at+OSSummit+Asia"></a>
<h3 class="boxed">02 October 2007 - Solr at OSSummit Asia</h3>
<p>
<a href="http://www.ossummit.com"><img alt="OSSummit Asia logo" class="float-right" src="http://www.ossummit.com/2007/images/logo.png"></a>
Lucene and Solr tutorials!
</p>
<p>The following talks and trainings are scheduled for the upcoming 2008 OSSummit:</p>
<ul>
<li>
<a href="http://www.ossummit.com/2007/program/talk/8">Lucene Boot Camp</a> by Erik Hatcher (originally by Grant Ingersoll). An all-day training focusing on getting started with Lucene - the core library under Solr.</li>
<li>
<a href="http://www.ossummit.com/2007/program/talk/25">Solr in a Day</a> by Erik Hatcher. All you need to know to use Solr effectively.</li>
<li>
<a href="http://www.ossummit.com/2007/program/talk/67">Lucene Case Studies</a> by Erik Hatcher. A rapid series of examples of many Lucene and Solr using applications.</li>
</ul>
<a name="N10058"></a><a name="03+September+2007+-+Lucene+at+ApacheCon+Atlanta"></a>
<h3 class="boxed">03 September 2007 - Lucene at ApacheCon Atlanta</h3>
<p>
<a href="http://www.us.apachecon.com"><img alt="ApacheCon US logo" class="float-right" src="http://www.apache.org/ads/ApacheCon/2007-usa-125x125.png"></a>
Lucene will once again be well represented at ApacheCon USA in Atlanta this November 12-16, 2007.
</p>
<p>The following talks and trainings are scheduled for this year's conference:</p>
<ul>
<li>November 12: <a href="http://us.apachecon.com/us2007/program/talk/1859">Lucene Boot Camp</a> by Grant Ingersoll. An all-day training focusing on getting started with Lucene.</li>
<li>November 16, 9:00 am: <a href="http://us.apachecon.com/us2007/program/talk/1992">Apache Solr out of the Box</a> by Chris Hostetter. Introduction to Solr.</li>
<li>November 16, 10:00 am: <a href="http://us.apachecon.com/us2007/program/talk/1943">Building a Vertical Search Site using Apache Software</a> by Ken Krugler. Will cover many Lucene-based projects.</li>
<li>November 16, 3:00 pm: <a href="http://us.apachecon.com/us2007/program/talk/1953">Apache Lucene Performance</a> by Grant Ingersoll. Tips and techniques for improving Lucene performance.</li>
<li>November 16, 4:00 pm: <a href="http://us.apachecon.com/us2007/program/talk/2017"> Advanced Indexing Techniques with Apache Lucene</a> by Michael Busch. Information on payloads and advanced indexing techniques.</li>
</ul>
<a name="N10091"></a><a name="06+June+2007%3A+Release+1.2+available"></a>
<h3 class="boxed">06 June 2007: Release 1.2 available</h3>
<p>
This is the first release since Solr graduated from the Incubator,
bringing many new features, including CSV/delimited-text data
loading, time based autocommit, faster faceting, negative filters,
a spell-check handler, sounds-like word filters, regex text filters,
and more flexible plugins.
</p>
<p>See the <a href="http://svn.apache.org/repos/asf/lucene/solr/tags/release-1.2.0/CHANGES.txt">release notes</a> for more details.</p>
<a name="N100A2"></a><a name="17+January+2007%3A+Solr+graduates+from+Incubator"></a>
<h3 class="boxed">17 January 2007: Solr graduates from Incubator</h3>
<p>
Solr has graduated from the Apache Incubator, and is now a sub-project of Lucene.
</p>
<a name="N100AC"></a><a name="22+December+2006%3A+Release+1.1.0+available"></a>
<h3 class="boxed">22 December 2006: Release 1.1.0 available</h3>
<p>
This is the first release since Solr joined the Incubator, and brings
many new features and performance optimizations including highlighting,
faceted search, and JSON/Python/Ruby response formats.
</p>
<a name="N100B6"></a><a name="15+August+2006%3A+Solr+at+ApacheCon+US"></a>
<h3 class="boxed">15 August 2006: Solr at ApacheCon US</h3>
<p>Chris Hostetter will be presenting
<strong><a href="http://www.apachecon.com/2006/US/html/sessions.html#FR26">"Faceted Searching With Apache Solr"</a></strong>
at ApacheCon US 2006, on October 13th at 4:30pm.
See the <a href="http://www.us.apachecon.com/">ApacheCon</a> website for more details.
</p>
<a name="N100C9"></a><a name="21+April+2006%3A+Solr+at+ApacheCon"></a>
<h3 class="boxed">21 April 2006: Solr at ApacheCon</h3>
<p>Yonik Seeley will be presenting
<strong>"Apache Solr, a Full-Text Search Server based on Lucene"</strong>
at ApacheCon Europe 2006, on June 29th at 5:30pm.
See the <a href="http://www.eu.apachecon.com/">ApacheCon</a> website for more details.
</p>
<a name="N100DA"></a><a name="21+February+2006%3A+nightly+builds"></a>
<h3 class="boxed">21 February 2006: nightly builds</h3>
<p>Solr now has nightly builds. This automatically creates a
<a href="http://people.apache.org/builds/lucene/solr/nightly/">downloadable version of Solr every
night</a>. All unit tests must pass, or a message is sent to
the developers mailing list and no new version is created. This
also updates the <a href="api/index.html">javadoc</a>.</p>
<a name="N100EC"></a><a name="17+January+2006%3A+Solr+Joins+Apache+Incubator"></a>
<h3 class="boxed">17 January 2006: Solr Joins Apache Incubator</h3>
<p>Solr, a search server based on Lucene, has been accepted into the Apache Incubator.
Solr was originally developed by CNET Networks, and is widely used within CNET
to provide high relevancy search and faceted browsing capabilities.
</p>
</div>
</div>
<!--+
|end content
+-->
<div class="clearboth">&nbsp;</div>
</div>
<div id="footer">
<!--+
|start bottomstrip
+-->
<div class="lastmodified">
<script type="text/javascript"><!--
document.write("Last Published: " + document.lastModified);
// --></script>
</div>
<div class="copyright">
Copyright &copy;
2007 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
</div>
<div id="logos"></div>
<!--+
|end bottomstrip
+-->
</div>
</body>
</html>