Fix for Lucene-1500 - new exception added to Highlighter API to handle TokenStreams with Tokens that exceed given text length

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@758460 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Harwood 2009-03-25 23:05:14 +00:00
parent 126f4b18d3
commit 73a02ec6fe
4 changed files with 61 additions and 10 deletions

View File

@ -65,6 +65,10 @@ API Changes
11. LUCENE-1561: Renamed Field.omitTf to Field.omitTermFreqAndPositions
(Otis Gospodnetic via Mike McCandless)
12. LUCENE-1500: Added new InvalidTokenOffsetsException to Highlighter methods
to denote issues when offsets in TokenStream tokens exceed the length of the
provided text. (Mark Harwood)
Bug fixes

View File

@ -74,9 +74,10 @@ public class Highlighter
* @param fieldName Name of field used to influence analyzer's tokenization policy
*
* @return highlighted text fragment or null if no terms found
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
throws IOException
throws IOException, InvalidTokenOffsetsException
{
TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
return getBestFragment(tokenStream, text);
@ -96,9 +97,10 @@ public class Highlighter
* @param text text to highlight terms in
*
* @return highlighted text fragment or null if no terms found
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final String getBestFragment(TokenStream tokenStream, String text)
throws IOException
throws IOException, InvalidTokenOffsetsException
{
String[] results = getBestFragments(tokenStream,text, 1);
if (results.length > 0)
@ -120,12 +122,13 @@ public class Highlighter
* @deprecated This method incorrectly hardcodes the choice of fieldname. Use the
* method of the same name that takes a fieldname.
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final String[] getBestFragments(
Analyzer analyzer,
String text,
int maxNumFragments)
throws IOException
throws IOException, InvalidTokenOffsetsException
{
TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
return getBestFragments(tokenStream, text, maxNumFragments);
@ -142,13 +145,14 @@ public class Highlighter
* @param maxNumFragments the maximum number of fragments.
*
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final String[] getBestFragments(
Analyzer analyzer,
String fieldName,
String text,
int maxNumFragments)
throws IOException
throws IOException, InvalidTokenOffsetsException
{
TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
return getBestFragments(tokenStream, text, maxNumFragments);
@ -165,12 +169,13 @@ public class Highlighter
* @param maxNumFragments the maximum number of fragments.
*
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final String[] getBestFragments(
TokenStream tokenStream,
String text,
int maxNumFragments)
throws IOException
throws IOException, InvalidTokenOffsetsException
{
maxNumFragments = Math.max(1, maxNumFragments); //sanity check
@ -198,13 +203,14 @@ public class Highlighter
* @param maxNumFragments
* @param mergeContiguousFragments
* @throws IOException
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final TextFragment[] getBestTextFragments(
TokenStream tokenStream,
String text,
boolean mergeContiguousFragments,
int maxNumFragments)
throws IOException
throws IOException, InvalidTokenOffsetsException
{
ArrayList docFrags = new ArrayList();
StringBuffer newText=new StringBuffer();
@ -230,6 +236,14 @@ public class Highlighter
(nextToken!= null)&&(nextToken.startOffset()< maxDocCharsToAnalyze);
nextToken = tokenStream.next(reusableToken))
{
if( (nextToken.endOffset()>text.length())
||
(nextToken.startOffset()>text.length())
)
{
throw new InvalidTokenOffsetsException("Token "+nextToken.toString()
+" exceeds length of provided text sized "+text.length());
}
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(nextToken)))
{
//the current token is distinct from previous tokens -
@ -452,13 +466,14 @@ public class Highlighter
* @param separator the separator used to intersperse the document fragments (typically "...")
*
* @return highlighted text
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final String getBestFragments(
TokenStream tokenStream,
String text,
int maxNumFragments,
String separator)
throws IOException
throws IOException, InvalidTokenOffsetsException
{
String sections[] = getBestFragments(tokenStream,text, maxNumFragments);
StringBuffer result = new StringBuffer();

View File

@ -0,0 +1,31 @@
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Checked exception signalling that the Tokens produced by a TokenStream
 * are incompatible with the text they were supplied with.
 */
public class InvalidTokenOffsetsException extends Exception {

  /**
   * Creates the exception with the given detail message.
   *
   * @param message detail message, handed unchanged to the Exception superclass
   */
  public InvalidTokenOffsetsException(String message) {
    super(message);
  }
}

View File

@ -136,9 +136,10 @@ public class HighlighterTest extends TestCase implements Formatter {
/**
* This method intended for use with <tt>testHighlightingWithDefaultField()</tt>
* @throws InvalidTokenOffsetsException
*/
private static String highlightField(Query query, String fieldName, String text)
throws IOException {
throws IOException, InvalidTokenOffsetsException {
CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer().tokenStream(
fieldName, new StringReader(text)));
// Assuming "<B>", "</B>" used to highlight
@ -1291,7 +1292,7 @@ public class HighlighterTest extends TestCase implements Formatter {
private Directory dir = new RAMDirectory();
private Analyzer a = new WhitespaceAnalyzer();
public void testWeightedTermsWithDeletes() throws IOException, ParseException {
public void testWeightedTermsWithDeletes() throws IOException, ParseException, InvalidTokenOffsetsException {
makeIndex();
deleteDocument();
searchIndex();
@ -1321,7 +1322,7 @@ public class HighlighterTest extends TestCase implements Formatter {
writer.close();
}
private void searchIndex() throws IOException, ParseException {
private void searchIndex() throws IOException, ParseException, InvalidTokenOffsetsException {
String q = "t_text1:random";
QueryParser parser = new QueryParser( "t_text1", a );
Query query = parser.parse( q );