Fix for Lucene-1500 - new exception added to Highlighter API to handle TokenStreams with Tokens that exceed given text length

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@758460 13f79535-47bb-0310-9956-ffa450edef68
2009-03-25 23:05:14 +00:00 · 2009-03-25 23:05:14 +00:00 · 73a02ec6fe
parent 126f4b18d3
commit 73a02ec6fe
4 changed files with 61 additions and 10 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -65,6 +65,10 @@ API Changes

 11. LUCENE-1561: Renamed Field.omitTf to Field.omitTermFreqAndPositions
   (Otis Gospodnetic via Mike McCandless)
+  
+12. LUCENE-1500: Added new InvalidTokenOffsetsException to Highlighter methods
+    to denote issues when offsets in TokenStream tokens exceed the length of the
+    provided text.  (Mark Harwood)

 Bug fixes

--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
@ -74,9 +74,10 @@ public class Highlighter
 	 * @param fieldName Name of field used to influence analyzer's tokenization policy
 	 *
 	 * @return highlighted text fragment or null if no terms found
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
 		return getBestFragment(tokenStream, text);
@ -96,9 +97,10 @@ public class Highlighter
 	 * @param text text to highlight terms in
 	 *
 	 * @return highlighted text fragment or null if no terms found
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String getBestFragment(TokenStream tokenStream, String text)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		String[] results = getBestFragments(tokenStream,text, 1);
 		if (results.length > 0)
@ -120,12 +122,13 @@ public class Highlighter
 	 * @deprecated This method incorrectly hardcodes the choice of fieldname. Use the
 	 * method of the same name that takes a fieldname.
 	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+	 * @throws InvalidTokenOffsetsException  thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String[] getBestFragments(
 		Analyzer analyzer,
 		String text,
 		int maxNumFragments)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
 		return getBestFragments(tokenStream, text, maxNumFragments);
@ -142,13 +145,14 @@ public class Highlighter
 	 * @param maxNumFragments  the maximum number of fragments.
 	 *
 	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String[] getBestFragments(
 		Analyzer analyzer,
 		String fieldName,
 		String text,
 		int maxNumFragments)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
 		return getBestFragments(tokenStream, text, maxNumFragments);
@ -165,12 +169,13 @@ public class Highlighter
 	 * @param maxNumFragments  the maximum number of fragments.
 	 *
 	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String[] getBestFragments(
 		TokenStream tokenStream,
 		String text,
 		int maxNumFragments)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		maxNumFragments = Math.max(1, maxNumFragments); //sanity check

@ -198,13 +203,14 @@ public class Highlighter
 	 * @param maxNumFragments
 	 * @param mergeContiguousFragments
 	 * @throws IOException
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final TextFragment[] getBestTextFragments(
 		TokenStream tokenStream,
 		String text,
 		boolean mergeContiguousFragments,
 		int maxNumFragments)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		ArrayList docFrags = new ArrayList();
 		StringBuffer newText=new StringBuffer();
@ -230,6 +236,14 @@ public class Highlighter
 			     (nextToken!= null)&&(nextToken.startOffset()< maxDocCharsToAnalyze);
 			     nextToken = tokenStream.next(reusableToken))
 			{
+				if(	(nextToken.endOffset()>text.length())
+					||
+					(nextToken.startOffset()>text.length())
+					)						
+				{
+					throw new InvalidTokenOffsetsException("Token "+nextToken.toString()
+							+" exceeds length of provided text sized "+text.length());
+				}
 				if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(nextToken)))
 				{
 					//the current token is distinct from previous tokens -
@ -452,13 +466,14 @@ public class Highlighter
 	 * @param separator  the separator used to intersperse the document fragments (typically "...")
 	 *
 	 * @return highlighted text
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String getBestFragments(
 		TokenStream tokenStream,	
 		String text,
 		int maxNumFragments,
 		String separator)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		String sections[] =	getBestFragments(tokenStream,text, maxNumFragments);
 		StringBuffer result = new StringBuffer();
--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java
@ -0,0 +1,31 @@
+package org.apache.lucene.search.highlight;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Exception thrown if TokenStream Tokens are incompatible with provided text
+ *
+ */
+public class InvalidTokenOffsetsException extends Exception
+{
+
+	public InvalidTokenOffsetsException(String message)
+	{
+		super(message);
+	}
+
+}
--- a/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
+++ b/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
@ -136,9 +136,10 @@ public class HighlighterTest extends TestCase implements Formatter {

  /**
   * This method intended for use with <tt>testHighlightingWithDefaultField()</tt>
+ * @throws InvalidTokenOffsetsException 
   */
  private static String highlightField(Query query, String fieldName, String text)
-      throws IOException {
+      throws IOException, InvalidTokenOffsetsException {
    CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer().tokenStream(
        fieldName, new StringReader(text)));
    // Assuming "<B>", "</B>" used to highlight
@ -1291,7 +1292,7 @@ public class HighlighterTest extends TestCase implements Formatter {
  private Directory dir = new RAMDirectory();
  private Analyzer a = new WhitespaceAnalyzer();
  
-  public void testWeightedTermsWithDeletes() throws IOException, ParseException {
+  public void testWeightedTermsWithDeletes() throws IOException, ParseException, InvalidTokenOffsetsException {
    makeIndex();
    deleteDocument();
    searchIndex();
@ -1321,7 +1322,7 @@ public class HighlighterTest extends TestCase implements Formatter {
    writer.close();
  }
  
-  private void searchIndex() throws IOException, ParseException {
+  private void searchIndex() throws IOException, ParseException, InvalidTokenOffsetsException {
    String q = "t_text1:random";
    QueryParser parser = new QueryParser( "t_text1", a );
    Query query = parser.parse( q );