Added support for analyzers that produce overlapping tokens

PR:
Obtained from:
Submitted by:	Mark Harwood
Reviewed by:	
CVS: ----------------------------------------------------------------------
CVS: PR:
CVS:   If this change addresses a PR in the problem report tracking
CVS:   database, then enter the PR number(s) here.
CVS: Obtained from:
CVS:   If this change has been taken from another system, such as NCSA,
CVS:   then name the system in this line, otherwise delete it.
CVS: Submitted by:
CVS:   If this code has been contributed to Apache by someone else; i.e.,
CVS:   they sent us a patch or a new module, then include their name/email
CVS:   address here. If this is your work then delete this line.
CVS: Reviewed by:
CVS:   If we are doing pre-commit code reviews and someone else has
CVS:   reviewed your changes, include their name(s) here.
CVS:   If you have not had it reviewed then delete this line.


git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150990 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Harwood 2004-07-26 20:39:47 +00:00
parent cd10939321
commit 14f0da2aa2
5 changed files with 350 additions and 88 deletions

View File

@ -24,16 +24,10 @@ package org.apache.lucene.search.highlight;
public interface Formatter
{
/**
* Highlights a search term. For example, an HTML Formatter could simply do:
*
* <p><dl><dt></dt><dd><code>return "&lt;b&gt;" + term + "&lt;/b&gt;";</code></dd></dl>
*
* @param originalTermText (unstemmed) term text to highlight
* @param stemmedTerm the stemmed form of the originalTermText
* @param score The score for this term returned by Scorer.getTokenScore - one use for this may be to set font weight in highlighted text
* @param startOffset the position of the originalTermText in the text being highlighted
*
* @return highlighted term text
* @param originalText The section of text being considered for markup
* @param tokenGroup contains one or several overlapping Tokens along with
* their scores and positions.
* @return the text for this token group, possibly marked up for highlighting
*/
String highlightTerm(String originalTermText, String stemmedTerm, float score, int startOffset);
String highlightTerm(String originalText, TokenGroup tokenGroup);
}

View File

@ -150,27 +150,26 @@ public class Highlighter
int lastEndOffset = 0;
textFragmenter.start(text);
TokenGroup tokenGroup=new TokenGroup();
while ((token = tokenStream.next()) != null)
{
startOffset = token.startOffset();
endOffset = token.endOffset();
//FIXME an issue was reported with CJKTokenizer that I couldn't reproduce
// where the analyzer was producing overlapping tokens.
// I suspect the fix is to make startOffset=Math.max(startOffset,lastEndOffset+1)
// but can't be sure so I'll just leave this comment in for now
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
{
//the current token is distinct from previous tokens -
// markup the cached token group info
startOffset = tokenGroup.startOffset;
endOffset = tokenGroup.endOffset;
tokenText = text.substring(startOffset, endOffset);
// append text between end of last token (or beginning of text) and start of current token
String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(text.substring(lastEndOffset, startOffset));
newText.append(markedUpText);
lastEndOffset=endOffset;
tokenGroup.clear();
// does query contain current token?
float score=fragmentScorer.getTokenScore(token);
newText.append(formatter.highlightTerm(tokenText, token.termText(), score, startOffset));
//check if current token marks the start of a new fragment
if(textFragmenter.isNewFragment(token))
{
currentFrag.setScore(fragmentScorer.getFragmentScore());
@ -180,8 +179,10 @@ public class Highlighter
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
}
}
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
lastEndOffset = endOffset;
if(lastEndOffset>maxDocBytesToAnalyze)
{
break;
@ -189,6 +190,19 @@ public class Highlighter
}
currentFrag.setScore(fragmentScorer.getFragmentScore());
if(tokenGroup.numTokens>0)
{
//flush the accumulated text (same code as in above loop)
startOffset = tokenGroup.startOffset;
endOffset = tokenGroup.endOffset;
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(text.substring(lastEndOffset, startOffset));
newText.append(markedUpText);
lastEndOffset=endOffset;
}
// append text after end of last token
if (lastEndOffset < text.length())

View File

@ -25,6 +25,7 @@ public class SimpleHTMLFormatter implements Formatter
String preTag;
String postTag;
public SimpleHTMLFormatter(String preTag, String postTag)
{
this.preTag = preTag;
@ -41,17 +42,20 @@ public class SimpleHTMLFormatter implements Formatter
this.postTag = "</B>";
}
public String highlightTerm(String originalText, String term, float score, int startOffset)
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.Formatter#highlightTerm(java.lang.String, org.apache.lucene.search.highlight.TokenGroup)
*/
public String highlightTerm(String originalText, TokenGroup tokenGroup)
{
if(score<=0)
StringBuffer returnBuffer;
if(tokenGroup.getTotalScore()>0)
{
returnBuffer=new StringBuffer();
returnBuffer.append(preTag);
returnBuffer.append(originalText);
returnBuffer.append(postTag);
return returnBuffer.toString();
}
return originalText;
}
StringBuffer sb = new StringBuffer();
sb.append(preTag);
sb.append(originalText);
sb.append(postTag);
return sb.toString();
}
}

View File

@ -0,0 +1,120 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
/**
* One, or several overlapping tokens, along with the score(s) and the
* scope of the original text
* @author MAHarwood
*/
public class TokenGroup
{
private static final int MAX_NUM_TOKENS_PER_GROUP=50;
Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
float [] scores=new float[MAX_NUM_TOKENS_PER_GROUP];
int numTokens=0;
int startOffset=0;
int endOffset=0;
void addToken(Token token, float score)
{
if(numTokens==0)
{
startOffset=token.startOffset();
endOffset=token.endOffset();
}
else
{
startOffset=Math.min(startOffset,token.startOffset());
endOffset=Math.max(endOffset,token.endOffset());
}
tokens[numTokens]=token;
scores[numTokens]=score;
numTokens++;
}
boolean isDistinct(Token token)
{
return token.startOffset()>endOffset;
}
void clear()
{
numTokens=0;
}
/**
*
* @param index a value between 0 and numTokens -1
* @return the "n"th token
*/
public Token getToken(int index)
{
return tokens[index];
}
/**
*
* @param index a value between 0 and numTokens -1
* @return the "n"th score
*/
public float getScore(int index)
{
return scores[index];
}
/**
* @return the end position in the original text
*/
public int getEndOffset()
{
return endOffset;
}
/**
* @return the number of tokens in this group
*/
public int getNumTokens()
{
return numTokens;
}
/**
* @return the start position in the original text
*/
public int getStartOffset()
{
return startOffset;
}
/**
* @return
*/
public float getTotalScore()
{
float total=0;
for (int i = 0; i < numTokens; i++)
{
total+=scores[i];
}
return total;
}
}

View File

@ -1,4 +1,5 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
@ -16,13 +17,18 @@ package org.apache.lucene.search.highlight;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
//import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -35,6 +41,12 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TokenGroup;
import org.apache.lucene.search.highlight.WeightedTerm;
import org.apache.lucene.store.RAMDirectory;
/**
@ -60,6 +72,14 @@ public class HighlighterTest extends TestCase implements Formatter
"John Kennedy has been shot",
"This text has a typo in referring to Keneddy" };
/**
* Constructor for HighlightExtractorTest.
* @param arg0
*/
public HighlighterTest(String arg0)
{
super(arg0);
}
public void testSimpleHighlighter() throws Exception
{
@ -162,7 +182,7 @@ public class HighlighterTest extends TestCase implements Formatter
highlighter.setTextFragmenter(new SimpleFragmenter(2));
String result = highlighter.getBestFragment(tokenStream,texts[0]).trim();
assertTrue("Failed to find best section using weighted terms. Found: "+result
assertTrue("Failed to find best section using weighted terms. Found: ["+result+"]"
, "<B>Hello</B>".equals(result));
//readjust weights
@ -177,6 +197,28 @@ public class HighlighterTest extends TestCase implements Formatter
}
// tests a "complex" analyzer that produces multiple
// overlapping tokens
public void testOverlapAnalyzer() throws Exception
{
HashMap synonyms = new HashMap();
synonyms.put("football", "soccer,footie");
Analyzer analyzer = new SynonymAnalyzer(synonyms);
String srchkey = "football";
String s = "football-soccer in the euro 2004 footie competition";
Query query = QueryParser.parse(srchkey, "bookid", analyzer);
Highlighter highlighter = new Highlighter(new QueryScorer(query));
TokenStream tokenStream =
analyzer.tokenStream(null, new StringReader(s));
// Get 3 best fragments and seperate with a "..."
String result = highlighter.getBestFragments(tokenStream, s, 3, "...");
String expectedResult="<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
assertTrue("overlapping analyzer should handle highlights OK",expectedResult.equals(result));
}
public void testGetSimpleHighlight() throws Exception
{
@ -355,9 +397,9 @@ public class HighlighterTest extends TestCase implements Formatter
*/
public String highlightTerm(String originalText , String weightedTerm, float score, int startOffset)
public String highlightTerm(String originalText , TokenGroup group)
{
if(score<=0)
if(group.getTotalScore()<=0)
{
return originalText;
}
@ -432,3 +474,91 @@ public class HighlighterTest extends TestCase implements Formatter
}
}
//===================================================================
//========== BEGIN TEST SUPPORTING CLASSES
//========== THESE LOOK LIKE, WITH SOME MORE EFFORT THESE COULD BE
//========== MADE MORE GENERALLY USEFUL.
// TODO - make synonyms all interchangeable with each other and produce
// a version that does antonyms(?) - the "is a specialised type of ...."
// so that car=audi, bmw and volkswagen but bmw != audi so different
// behaviour to synonyms
//===================================================================
class SynonymAnalyzer extends Analyzer
{
private Map synonyms;
public SynonymAnalyzer(Map synonyms)
{
this.synonyms = synonyms;
}
/* (non-Javadoc)
* @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
*/
public TokenStream tokenStream(String arg0, Reader arg1)
{
return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms);
}
}
/**
* Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer)
* @author MAHarwood
*/
class SynonymTokenizer extends TokenStream
{
private TokenStream realStream;
private Token currentRealToken = null;
private Map synonyms;
StringTokenizer st = null;
public SynonymTokenizer(TokenStream realStream, Map synonyms)
{
this.realStream = realStream;
this.synonyms = synonyms;
}
public Token next() throws IOException
{
if (currentRealToken == null)
{
Token nextRealToken = realStream.next();
if (nextRealToken == null)
{
return null;
}
String expansions = (String) synonyms.get(nextRealToken.termText());
if (expansions == null)
{
return nextRealToken;
}
st = new StringTokenizer(expansions, ",");
if (st.hasMoreTokens())
{
currentRealToken = nextRealToken;
}
return currentRealToken;
}
else
{
String nextExpandedValue = st.nextToken();
Token expandedToken =
new Token(
nextExpandedValue,
currentRealToken.startOffset(),
currentRealToken.endOffset());
expandedToken.setPositionIncrement(0);
if (!st.hasMoreTokens())
{
currentRealToken = null;
st = null;
}
return expandedToken;
}
}
}