Added Nicko Cadell's Encoder contribution

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@151622 13f79535-47bb-0310-9956-ffa450edef68
2005-02-06 21:31:54 +00:00 · 2005-02-06 21:31:54 +00:00 · 276ab079f5
parent b1555b0bbf
commit 276ab079f5
6 changed files with 238 additions and 15 deletions
--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java
@ -0,0 +1,33 @@
+package org.apache.lucene.search.highlight;
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Simple {@link Encoder} implementation that does not modify the output
+ * @author Nicko Cadell
+ *
+ */
+public class DefaultEncoder implements Encoder
+{
+	public DefaultEncoder()
+	{
+	}
+
+	public String encodeText(String originalText)
+	{
+		return originalText;
+	}
+}
--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java
@ -0,0 +1,31 @@
+package org.apache.lucene.search.highlight;
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Encodes original text. The Encoder works with the Formatter to generate the output.
+ *
+ * @author Nicko Cadell
+ */
+public interface Encoder
+{
+	/**
+	 * @param originalText The section of text being output
+	 * @return
+	 */
+	String encodeText(String originalText);
+}
--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
@ -1,6 +1,6 @@
 package org.apache.lucene.search.highlight;
 /**
- * Copyright 2002-2004 The Apache Software Foundation
+ * Copyright 2002-2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -24,8 +24,8 @@ import org.apache.lucene.util.PriorityQueue;

 /**
 * Class used to markup highlighted terms found in the best sections of a 
- * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter} 
- * and tokenizers.
+ * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}, 
+ * {@link Encoder} and tokenizers.
 * @author mark@searcharea.co.uk
 */
 public class Highlighter
@ -34,6 +34,7 @@ public class Highlighter
 	public static final  int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024;
 	private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
 	private Formatter formatter;
+	private Encoder encoder;	
 	private Fragmenter textFragmenter=new SimpleFragmenter();
 	private Scorer fragmentScorer=null;

@ -43,12 +44,18 @@ public class Highlighter
 	}
 	
 	
-	public Highlighter(Formatter formatter, Scorer fragmentScorer)
-	{
-		this.formatter = formatter;
-		this.fragmentScorer = fragmentScorer;
+ 	public Highlighter(Formatter formatter, Scorer fragmentScorer)
+ 	{
+		this(formatter,new DefaultEncoder(),fragmentScorer);
 	}
 	
+	
+	public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
+	{
+ 		this.formatter = formatter;
+		this.encoder = encoder;
+ 		this.fragmentScorer = fragmentScorer;
+ 	}



@ -160,10 +167,10 @@ public class Highlighter
 					startOffset = tokenGroup.startOffset;
 					endOffset = tokenGroup.endOffset;		
 					tokenText = text.substring(startOffset, endOffset);
-					String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
+					String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
 					//store any whitespace etc from between this and last group
 					if (startOffset > lastEndOffset)
-						newText.append(text.substring(lastEndOffset, startOffset));
+						newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
 					newText.append(markedUpText);
 					lastEndOffset=endOffset;
 					tokenGroup.clear();
@ -195,17 +202,17 @@ public class Highlighter
 				startOffset = tokenGroup.startOffset;
 				endOffset = tokenGroup.endOffset;		
 				tokenText = text.substring(startOffset, endOffset);
-				String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
+				String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
 				//store any whitespace etc from between this and last group
 				if (startOffset > lastEndOffset)
-					newText.append(text.substring(lastEndOffset, startOffset));
+					newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
 				newText.append(markedUpText);
 				lastEndOffset=endOffset;						
 			}

 			// append text after end of last token
 			if (lastEndOffset < text.length())
-				newText.append(text.substring(lastEndOffset));
+				newText.append(encoder.encodeText(text.substring(lastEndOffset)));

 			currentFrag.textEndPos = newText.length();

@ -438,7 +445,14 @@ public class Highlighter
 		fragmentScorer = scorer;
 	}

-
+    public Encoder getEncoder()
+    {
+        return encoder;
+    }
+    public void setEncoder(Encoder encoder)
+    {
+        this.encoder = encoder;
+    }
 }
 class FragmentQueue extends PriorityQueue
 {
--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java
@ -0,0 +1,75 @@
+package org.apache.lucene.search.highlight;
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Simple {@link Encoder} implementation to escape text for HTML output
+ * @author Nicko Cadell
+ *
+ */
+public class SimpleHTMLEncoder implements Encoder
+{
+	public SimpleHTMLEncoder()
+	{
+	}
+
+	public String encodeText(String originalText)
+	{
+		return htmlEncode(originalText);
+	}
+	
+	/**
+	 * Encode string into HTML
+	 */
+	public final static String htmlEncode(String plainText) 
+	{
+		if (plainText == null || plainText.length() == 0)
+		{
+			return "";
+		}
+
+		StringBuffer result = new StringBuffer(plainText.length());
+
+		for (int index=0; index<plainText.length(); index++) 
+		{
+			char ch = plainText.charAt(index);
+
+			switch (ch) 
+			{
+			case '"':
+				result.append("&quot;");
+				break;
+
+			case '&':
+				result.append("&amp;");
+				break;
+
+			case '<':
+				result.append("&lt;");
+				break;
+
+			case '>':
+				result.append("&gt;");
+				break;
+
+			default:
+				result.append(ch);
+			}
+		}
+
+		return result.toString();
+	}
+}
--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html
@ -24,7 +24,13 @@ Fragmenter, FragmentScorer, Formatter classes.
 		}
 </pre>
 <h2>New features 06/02/2005</h2>
-This release adds options for encoding (thanks to Nicko Cadell)
+This release adds options for encoding (thanks to Nicko Cadell).
+An "Encoder" implementation such as the new SimpleHTMLEncoder class can be passed to the highlighter to encode
+all those non-xhtml standard characters such as &amp; into legal values. This simple class may not suffice for
+some languages -  Commons Lang has an implementation that could be used: escapeHtml(String) in
+http://svn.apache.org/viewcvs.cgi/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java?rev=137958&view=markup
+
+

 <h2>New features 22/12/2004</h2>
 This release adds some new capabilities:
--- a/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
+++ b/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
@ -16,6 +16,8 @@ package org.apache.lucene.search.highlight;
 * limitations under the License.
 */

+import java.io.ByteArrayInputStream;
+import java.io.File;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
@ -23,6 +25,10 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.StringTokenizer;

+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
 import junit.framework.TestCase;

 import org.apache.lucene.analysis.Analyzer;
@ -42,6 +48,9 @@ import org.apache.lucene.search.MultiSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Searcher;
 import org.apache.lucene.store.RAMDirectory;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;

 /**
 * JUnit Test for Highlighter class.
@ -105,7 +114,7 @@ public class HighlighterTest extends TestCase implements Formatter
 	{
 		doSearching("Kinnedy~");
 		doStandardHighlights();
-		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
+		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
 	}

 	public void testGetWildCardFragments() throws Exception
@ -323,6 +332,61 @@ public class HighlighterTest extends TestCase implements Formatter
 			assertNull("The highlight result should be null for text with no query terms", result);
 		}
 	}
+	
+	/**
+	 * Demonstrates creation of an XHTML compliant doc using new encoding facilities.
+	 * @throws Exception
+	 */
+	public void testEncoding() throws Exception
+    {
+        String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article";
+        //run the highlighter on the raw content (scorer does not score any tokens for 
+        // highlighting but scores a single fragment for selection
+        Highlighter highlighter = new Highlighter(this,
+                new SimpleHTMLEncoder(), new Scorer()
+                {
+                    public void startFragment(TextFragment newFragment)
+                    {
+                    }
+                    public float getTokenScore(Token token)
+                    {
+                        return 0;
+                    }
+                    public float getFragmentScore()
+                    {
+                        return 1;
+                    }
+                });
+        highlighter.setTextFragmenter(new SimpleFragmenter(2000));
+        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
+                new StringReader(rawDocContent));
+
+        String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent,1,"");
+        //An ugly bit of XML creation:
+        String xhtml="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"+
+            		"<!DOCTYPE html\n"+
+            		"PUBLIC \"//W3C//DTD XHTML 1.0 Transitional//EN\"\n"+
+            		"\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"+
+            		"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"+
+            		"<head>\n"+
+            		"<title>My Test HTML Document</title>\n"+
+            		"</head>\n"+
+            		"<body>\n"+
+            		"<h2>"+encodedSnippet+"</h2>\n"+
+            		"</body>\n"+
+            		"</html>";
+        //now an ugly built of XML parsing to test the snippet is encoded OK 
+  		DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+  		DocumentBuilder db = dbf.newDocumentBuilder();
+  		org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes()));
+  		Element root=doc.getDocumentElement();  		
+  		NodeList nodes=root.getElementsByTagName("body");
+  		Element body=(Element) nodes.item(0);
+  		nodes=body.getElementsByTagName("h2");
+        Element h2=(Element) nodes.item(0); 
+        String decodedSnippet=h2.getFirstChild().getNodeValue();
+        assertEquals("XHTML Encoding should have worked:", rawDocContent,decodedSnippet);
+    }

 	public void testMultiSearcher() throws Exception
 	{