mirror of https://github.com/apache/lucene.git
Added Nicko Cadell's Encoder contribution
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@151622 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b1555b0bbf
commit
276ab079f5
|
@ -0,0 +1,33 @@
|
|||
package org.apache.lucene.search.highlight;
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Simple {@link Encoder} implementation that does not modify the output
|
||||
* @author Nicko Cadell
|
||||
*
|
||||
*/
|
||||
public class DefaultEncoder implements Encoder
|
||||
{
|
||||
public DefaultEncoder()
|
||||
{
|
||||
}
|
||||
|
||||
public String encodeText(String originalText)
|
||||
{
|
||||
return originalText;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
package org.apache.lucene.search.highlight;
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
/**
 * Encodes original text. The Encoder works with the Formatter to generate the output.
 *
 * @author Nicko Cadell
 */
public interface Encoder
{
  /**
   * Encodes one section of the original text for inclusion in the output.
   *
   * @param originalText The section of text being output
   * @return the encoded form of <code>originalText</code>; what "encoded"
   *         means is decided by the implementation
   */
  String encodeText(String originalText);
}
|
|
@ -1,6 +1,6 @@
|
|||
package org.apache.lucene.search.highlight;
|
||||
/**
|
||||
* Copyright 2002-2004 The Apache Software Foundation
|
||||
* Copyright 2002-2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -24,8 +24,8 @@ import org.apache.lucene.util.PriorityQueue;
|
|||
|
||||
/**
|
||||
* Class used to markup highlighted terms found in the best sections of a
|
||||
* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}
|
||||
* and tokenizers.
|
||||
* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
|
||||
* {@link Encoder} and tokenizers.
|
||||
* @author mark@searcharea.co.uk
|
||||
*/
|
||||
public class Highlighter
|
||||
|
@ -34,6 +34,7 @@ public class Highlighter
|
|||
public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024;
|
||||
private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
|
||||
private Formatter formatter;
|
||||
private Encoder encoder;
|
||||
private Fragmenter textFragmenter=new SimpleFragmenter();
|
||||
private Scorer fragmentScorer=null;
|
||||
|
||||
|
@ -43,12 +44,18 @@ public class Highlighter
|
|||
}
|
||||
|
||||
|
||||
public Highlighter(Formatter formatter, Scorer fragmentScorer)
|
||||
{
|
||||
this.formatter = formatter;
|
||||
this.fragmentScorer = fragmentScorer;
|
||||
/**
 * Creates a highlighter that uses a {@link DefaultEncoder}.
 * Delegates to the three-argument constructor, passing a new
 * DefaultEncoder as the encoder.
 *
 * @param formatter passed through unchanged to the three-argument constructor
 * @param fragmentScorer passed through unchanged to the three-argument constructor
 */
public Highlighter(Formatter formatter, Scorer fragmentScorer)
|
||||
{
|
||||
this(formatter,new DefaultEncoder(),fragmentScorer);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Creates a highlighter from the three supplied collaborators, storing each
 * in the field of the same name.
 *
 * @param formatter stored in the <code>formatter</code> field
 * @param encoder stored in the <code>encoder</code> field
 * @param fragmentScorer stored in the <code>fragmentScorer</code> field
 */
public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
|
||||
{
|
||||
this.formatter = formatter;
|
||||
this.encoder = encoder;
|
||||
this.fragmentScorer = fragmentScorer;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
@ -160,10 +167,10 @@ public class Highlighter
|
|||
startOffset = tokenGroup.startOffset;
|
||||
endOffset = tokenGroup.endOffset;
|
||||
tokenText = text.substring(startOffset, endOffset);
|
||||
String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
|
||||
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
|
||||
//store any whitespace etc from between this and last group
|
||||
if (startOffset > lastEndOffset)
|
||||
newText.append(text.substring(lastEndOffset, startOffset));
|
||||
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
|
||||
newText.append(markedUpText);
|
||||
lastEndOffset=endOffset;
|
||||
tokenGroup.clear();
|
||||
|
@ -195,17 +202,17 @@ public class Highlighter
|
|||
startOffset = tokenGroup.startOffset;
|
||||
endOffset = tokenGroup.endOffset;
|
||||
tokenText = text.substring(startOffset, endOffset);
|
||||
String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
|
||||
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
|
||||
//store any whitespace etc from between this and last group
|
||||
if (startOffset > lastEndOffset)
|
||||
newText.append(text.substring(lastEndOffset, startOffset));
|
||||
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
|
||||
newText.append(markedUpText);
|
||||
lastEndOffset=endOffset;
|
||||
}
|
||||
|
||||
// append text after end of last token
|
||||
if (lastEndOffset < text.length())
|
||||
newText.append(text.substring(lastEndOffset));
|
||||
newText.append(encoder.encodeText(text.substring(lastEndOffset)));
|
||||
|
||||
currentFrag.textEndPos = newText.length();
|
||||
|
||||
|
@ -438,7 +445,14 @@ public class Highlighter
|
|||
fragmentScorer = scorer;
|
||||
}
|
||||
|
||||
|
||||
/**
 * @return the {@link Encoder} currently held in this highlighter's
 *         <code>encoder</code> field
 */
public Encoder getEncoder()
|
||||
{
|
||||
return encoder;
|
||||
}
|
||||
/**
 * Replaces the {@link Encoder} held in this highlighter's
 * <code>encoder</code> field.
 *
 * @param encoder the new encoder; stored as given, with no validation
 */
public void setEncoder(Encoder encoder)
|
||||
{
|
||||
this.encoder = encoder;
|
||||
}
|
||||
}
|
||||
class FragmentQueue extends PriorityQueue
|
||||
{
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
package org.apache.lucene.search.highlight;
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Simple {@link Encoder} implementation to escape text for HTML output
|
||||
* @author Nicko Cadell
|
||||
*
|
||||
*/
|
||||
public class SimpleHTMLEncoder implements Encoder
|
||||
{
|
||||
public SimpleHTMLEncoder()
|
||||
{
|
||||
}
|
||||
|
||||
public String encodeText(String originalText)
|
||||
{
|
||||
return htmlEncode(originalText);
|
||||
}
|
||||
|
||||
/**
|
||||
* Encode string into HTML
|
||||
*/
|
||||
public final static String htmlEncode(String plainText)
|
||||
{
|
||||
if (plainText == null || plainText.length() == 0)
|
||||
{
|
||||
return "";
|
||||
}
|
||||
|
||||
StringBuffer result = new StringBuffer(plainText.length());
|
||||
|
||||
for (int index=0; index<plainText.length(); index++)
|
||||
{
|
||||
char ch = plainText.charAt(index);
|
||||
|
||||
switch (ch)
|
||||
{
|
||||
case '"':
|
||||
result.append(""");
|
||||
break;
|
||||
|
||||
case '&':
|
||||
result.append("&");
|
||||
break;
|
||||
|
||||
case '<':
|
||||
result.append("<");
|
||||
break;
|
||||
|
||||
case '>':
|
||||
result.append(">");
|
||||
break;
|
||||
|
||||
default:
|
||||
result.append(ch);
|
||||
}
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
}
|
|
@ -24,7 +24,13 @@ Fragmenter, FragmentScorer, Formatter classes.
|
|||
}
|
||||
</pre>
|
||||
<h2>New features 06/02/2005</h2>
|
||||
This release adds options for encoding (thanks to Nicko Cadell)
|
||||
This release adds options for encoding (thanks to Nicko Cadell).
|
||||
An "Encoder" implementation such as the new SimpleHTMLEncoder class can be passed to the highlighter to encode
|
||||
all those non-xhtml standard characters such as &amp; into legal values. This simple class may not suffice for
|
||||
some languages - Commons Lang has an implementation that could be used: escapeHtml(String) in
|
||||
http://svn.apache.org/viewcvs.cgi/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java?rev=137958&view=markup
|
||||
|
||||
|
||||
|
||||
<h2>New features 22/12/2004</h2>
|
||||
This release adds some new capabilities:
|
||||
|
|
|
@ -16,6 +16,8 @@ package org.apache.lucene.search.highlight;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
@ -23,6 +25,10 @@ import java.util.HashMap;
|
|||
import java.util.Map;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
@ -42,6 +48,9 @@ import org.apache.lucene.search.MultiSearcher;
|
|||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Searcher;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.w3c.dom.Element;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
/**
|
||||
* JUnit Test for Highlighter class.
|
||||
|
@ -105,7 +114,7 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
{
|
||||
doSearching("Kinnedy~");
|
||||
doStandardHighlights();
|
||||
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
|
||||
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
|
||||
}
|
||||
|
||||
public void testGetWildCardFragments() throws Exception
|
||||
|
@ -323,6 +332,61 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
assertNull("The highlight result should be null for text with no query terms", result);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Demonstrates creation of an XHTML compliant doc using new encoding facilities.
|
||||
* @throws Exception
|
||||
*/
|
||||
public void testEncoding() throws Exception
|
||||
{
|
||||
String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article";
|
||||
//run the highlighter on the raw content (scorer does not score any tokens for
|
||||
// highlighting but scores a single fragment for selection
|
||||
Highlighter highlighter = new Highlighter(this,
|
||||
new SimpleHTMLEncoder(), new Scorer()
|
||||
{
|
||||
public void startFragment(TextFragment newFragment)
|
||||
{
|
||||
}
|
||||
public float getTokenScore(Token token)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
public float getFragmentScore()
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
});
|
||||
highlighter.setTextFragmenter(new SimpleFragmenter(2000));
|
||||
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
|
||||
new StringReader(rawDocContent));
|
||||
|
||||
String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent,1,"");
|
||||
//An ugly bit of XML creation:
|
||||
String xhtml="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"+
|
||||
"<!DOCTYPE html\n"+
|
||||
"PUBLIC \"//W3C//DTD XHTML 1.0 Transitional//EN\"\n"+
|
||||
"\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"+
|
||||
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"+
|
||||
"<head>\n"+
|
||||
"<title>My Test HTML Document</title>\n"+
|
||||
"</head>\n"+
|
||||
"<body>\n"+
|
||||
"<h2>"+encodedSnippet+"</h2>\n"+
|
||||
"</body>\n"+
|
||||
"</html>";
|
||||
//now an ugly built of XML parsing to test the snippet is encoded OK
|
||||
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder db = dbf.newDocumentBuilder();
|
||||
org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes()));
|
||||
Element root=doc.getDocumentElement();
|
||||
NodeList nodes=root.getElementsByTagName("body");
|
||||
Element body=(Element) nodes.item(0);
|
||||
nodes=body.getElementsByTagName("h2");
|
||||
Element h2=(Element) nodes.item(0);
|
||||
String decodedSnippet=h2.getFirstChild().getNodeValue();
|
||||
assertEquals("XHTML Encoding should have worked:", rawDocContent,decodedSnippet);
|
||||
}
|
||||
|
||||
public void testMultiSearcher() throws Exception
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue