Added Nicko Cadell's Encoder contribution

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@151622 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Harwood 2005-02-06 21:31:54 +00:00
parent b1555b0bbf
commit 276ab079f5
6 changed files with 238 additions and 15 deletions

View File

@ -0,0 +1,33 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Simple {@link Encoder} implementation that does not modify the output
* @author Nicko Cadell
*
*/
public class DefaultEncoder implements Encoder
{
public DefaultEncoder()
{
}
public String encodeText(String originalText)
{
return originalText;
}
}

View File

@ -0,0 +1,31 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Encodes original text. The Encoder works with the Formatter to generate the output.
*
* @author Nicko Cadell
*/
public interface Encoder
{
/**
* @param originalText The section of text being output
* @return
*/
String encodeText(String originalText);
}

View File

@ -1,6 +1,6 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
* Copyright 2002-2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -24,8 +24,8 @@ import org.apache.lucene.util.PriorityQueue;
/**
* Class used to markup highlighted terms found in the best sections of a
* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}
* and tokenizers.
* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
* {@link Encoder} and tokenizers.
* @author mark@searcharea.co.uk
*/
public class Highlighter
@ -34,6 +34,7 @@ public class Highlighter
public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024;
private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
private Formatter formatter;
private Encoder encoder;
private Fragmenter textFragmenter=new SimpleFragmenter();
private Scorer fragmentScorer=null;
@ -43,12 +44,18 @@ public class Highlighter
}
public Highlighter(Formatter formatter, Scorer fragmentScorer)
{
this.formatter = formatter;
this.fragmentScorer = fragmentScorer;
public Highlighter(Formatter formatter, Scorer fragmentScorer)
{
this(formatter,new DefaultEncoder(),fragmentScorer);
}
public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
{
this.formatter = formatter;
this.encoder = encoder;
this.fragmentScorer = fragmentScorer;
}
@ -160,10 +167,10 @@ public class Highlighter
startOffset = tokenGroup.startOffset;
endOffset = tokenGroup.endOffset;
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(text.substring(lastEndOffset, startOffset));
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset=endOffset;
tokenGroup.clear();
@ -195,17 +202,17 @@ public class Highlighter
startOffset = tokenGroup.startOffset;
endOffset = tokenGroup.endOffset;
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(text.substring(lastEndOffset, startOffset));
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset=endOffset;
}
// append text after end of last token
if (lastEndOffset < text.length())
newText.append(text.substring(lastEndOffset));
newText.append(encoder.encodeText(text.substring(lastEndOffset)));
currentFrag.textEndPos = newText.length();
@ -438,7 +445,14 @@ public class Highlighter
fragmentScorer = scorer;
}
public Encoder getEncoder()
{
return encoder;
}
public void setEncoder(Encoder encoder)
{
this.encoder = encoder;
}
}
class FragmentQueue extends PriorityQueue
{

View File

@ -0,0 +1,75 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Simple {@link Encoder} implementation to escape text for HTML output
* @author Nicko Cadell
*
*/
public class SimpleHTMLEncoder implements Encoder
{
public SimpleHTMLEncoder()
{
}
public String encodeText(String originalText)
{
return htmlEncode(originalText);
}
/**
* Encode string into HTML
*/
public final static String htmlEncode(String plainText)
{
if (plainText == null || plainText.length() == 0)
{
return "";
}
StringBuffer result = new StringBuffer(plainText.length());
for (int index=0; index<plainText.length(); index++)
{
char ch = plainText.charAt(index);
switch (ch)
{
case '"':
result.append("&quot;");
break;
case '&':
result.append("&amp;");
break;
case '<':
result.append("&lt;");
break;
case '>':
result.append("&gt;");
break;
default:
result.append(ch);
}
}
return result.toString();
}
}

View File

@ -24,7 +24,13 @@ Fragmenter, FragmentScorer, Formatter classes.
}
</pre>
<h2>New features 06/02/2005</h2>
This release adds options for encoding (thanks to Nicko Cadell)
This release adds options for encoding (thanks to Nicko Cadell).
An "Encoder" implementation such as the new SimpleHTMLEncoder class can be passed to the highlighter to encode
all those non-xhtml standard characters such as &amp; into legal values. This simple class may not suffice for
some languages - Commons Lang has an implementation that could be used: escapeHtml(String) in
http://svn.apache.org/viewcvs.cgi/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java?rev=137958&view=markup
<h2>New features 22/12/2004</h2>
This release adds some new capabilities:

View File

@ -16,6 +16,8 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@ -23,6 +25,10 @@ import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
@ -42,6 +48,9 @@ import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.RAMDirectory;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
* JUnit Test for Highlighter class.
@ -105,7 +114,7 @@ public class HighlighterTest extends TestCase implements Formatter
{
doSearching("Kinnedy~");
doStandardHighlights();
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
}
public void testGetWildCardFragments() throws Exception
@ -323,6 +332,61 @@ public class HighlighterTest extends TestCase implements Formatter
assertNull("The highlight result should be null for text with no query terms", result);
}
}
/**
* Demonstrates creation of an XHTML compliant doc using new encoding facilities.
* @throws Exception
*/
public void testEncoding() throws Exception
{
String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article";
//run the highlighter on the raw content (scorer does not score any tokens for
// highlighting but scores a single fragment for selection
Highlighter highlighter = new Highlighter(this,
new SimpleHTMLEncoder(), new Scorer()
{
public void startFragment(TextFragment newFragment)
{
}
public float getTokenScore(Token token)
{
return 0;
}
public float getFragmentScore()
{
return 1;
}
});
highlighter.setTextFragmenter(new SimpleFragmenter(2000));
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
new StringReader(rawDocContent));
String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent,1,"");
//An ugly bit of XML creation:
String xhtml="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"+
"<!DOCTYPE html\n"+
"PUBLIC \"//W3C//DTD XHTML 1.0 Transitional//EN\"\n"+
"\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"+
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"+
"<head>\n"+
"<title>My Test HTML Document</title>\n"+
"</head>\n"+
"<body>\n"+
"<h2>"+encodedSnippet+"</h2>\n"+
"</body>\n"+
"</html>";
//now an ugly built of XML parsing to test the snippet is encoded OK
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();
org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes()));
Element root=doc.getDocumentElement();
NodeList nodes=root.getElementsByTagName("body");
Element body=(Element) nodes.item(0);
nodes=body.getElementsByTagName("h2");
Element h2=(Element) nodes.item(0);
String decodedSnippet=h2.getFirstChild().getNodeValue();
assertEquals("XHTML Encoding should have worked:", rawDocContent,decodedSnippet);
}
public void testMultiSearcher() throws Exception
{