mirror of https://github.com/apache/lucene.git
Added support for analyzers that produce overlapping tokens
PR: Obtained from: Submitted by: Mark Harwod Reviewed by: CVS: ---------------------------------------------------------------------- CVS: PR: CVS: If this change addresses a PR in the problem report tracking CVS: database, then enter the PR number(s) here. CVS: Obtained from: CVS: If this change has been taken from another system, such as NCSA, CVS: then name the system in this line, otherwise delete it. CVS: Submitted by: CVS: If this code has been contributed to Apache by someone else; i.e., CVS: they sent us a patch or a new module, then include their name/email CVS: address here. If this is your work then delete this line. CVS: Reviewed by: CVS: If we are doing pre-commit code reviews and someone else has CVS: reviewed your changes, include their name(s) here. CVS: If you have not had it reviewed then delete this line. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150990 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cd10939321
commit
14f0da2aa2
|
@ -23,17 +23,11 @@ package org.apache.lucene.search.highlight;
|
|||
*/
|
||||
public interface Formatter
|
||||
{
|
||||
/**
|
||||
* Highlights a search term. For example, an HTML Formatter could simply do:
|
||||
*
|
||||
* <p><dl><dt></dt><dd><code>return "<b>" + term + "</b>";</code></dd></dl>
|
||||
*
|
||||
* @param originalTermText (unstemmed) term text to highlight
|
||||
* @param stemmedTerm the stemmed form of the originalTermText
|
||||
* @param score The score for this term returned by Scorer.getTokenScore - one use for this may be to set font weight in highlighted text
|
||||
* @param startOffset the position of the originalTermText in the text being highlighted
|
||||
*
|
||||
* @return highlighted term text
|
||||
*/
|
||||
String highlightTerm(String originalTermText, String stemmedTerm, float score, int startOffset);
|
||||
/**
|
||||
* @param originalText The section of text being considered for markup
|
||||
* @param tokenGroup contains one or several overlapping Tokens along with
|
||||
* their scores and positions.
|
||||
* @return
|
||||
*/
|
||||
String highlightTerm(String originalText, TokenGroup tokenGroup);
|
||||
}
|
||||
|
|
|
@ -149,46 +149,60 @@ public class Highlighter
|
|||
int endOffset;
|
||||
int lastEndOffset = 0;
|
||||
textFragmenter.start(text);
|
||||
|
||||
TokenGroup tokenGroup=new TokenGroup();
|
||||
|
||||
while ((token = tokenStream.next()) != null)
|
||||
{
|
||||
|
||||
startOffset = token.startOffset();
|
||||
endOffset = token.endOffset();
|
||||
//FIXME an issue was reported with CJKTokenizer that I couldnt reproduce
|
||||
// where the analyzer was producing overlapping tokens.
|
||||
// I suspect the fix is to make startOffset=Math.max(startOffset,lastEndOffset+1)
|
||||
// but cant be sure so I'll just leave this comment in for now
|
||||
tokenText = text.substring(startOffset, endOffset);
|
||||
|
||||
|
||||
// append text between end of last token (or beginning of text) and start of current token
|
||||
if (startOffset > lastEndOffset)
|
||||
newText.append(text.substring(lastEndOffset, startOffset));
|
||||
|
||||
// does query contain current token?
|
||||
float score=fragmentScorer.getTokenScore(token);
|
||||
newText.append(formatter.highlightTerm(tokenText, token.termText(), score, startOffset));
|
||||
|
||||
|
||||
if(textFragmenter.isNewFragment(token))
|
||||
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
|
||||
{
|
||||
currentFrag.setScore(fragmentScorer.getFragmentScore());
|
||||
//record stats for a new fragment
|
||||
currentFrag.textEndPos = newText.length();
|
||||
currentFrag =new TextFragment(newText.length(), docFrags.size());
|
||||
fragmentScorer.startFragment(currentFrag);
|
||||
docFrags.add(currentFrag);
|
||||
}
|
||||
//the current token is distinct from previous tokens -
|
||||
// markup the cached token group info
|
||||
startOffset = tokenGroup.startOffset;
|
||||
endOffset = tokenGroup.endOffset;
|
||||
tokenText = text.substring(startOffset, endOffset);
|
||||
String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
|
||||
//store any whitespace etc from between this and last group
|
||||
if (startOffset > lastEndOffset)
|
||||
newText.append(text.substring(lastEndOffset, startOffset));
|
||||
newText.append(markedUpText);
|
||||
lastEndOffset=endOffset;
|
||||
tokenGroup.clear();
|
||||
|
||||
lastEndOffset = endOffset;
|
||||
//check if current token marks the start of a new fragment
|
||||
if(textFragmenter.isNewFragment(token))
|
||||
{
|
||||
currentFrag.setScore(fragmentScorer.getFragmentScore());
|
||||
//record stats for a new fragment
|
||||
currentFrag.textEndPos = newText.length();
|
||||
currentFrag =new TextFragment(newText.length(), docFrags.size());
|
||||
fragmentScorer.startFragment(currentFrag);
|
||||
docFrags.add(currentFrag);
|
||||
}
|
||||
}
|
||||
|
||||
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
|
||||
|
||||
if(lastEndOffset>maxDocBytesToAnalyze)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
currentFrag.setScore(fragmentScorer.getFragmentScore());
|
||||
|
||||
|
||||
if(tokenGroup.numTokens>0)
|
||||
{
|
||||
//flush the accumulated text (same code as in above loop)
|
||||
startOffset = tokenGroup.startOffset;
|
||||
endOffset = tokenGroup.endOffset;
|
||||
tokenText = text.substring(startOffset, endOffset);
|
||||
String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
|
||||
//store any whitespace etc from between this and last group
|
||||
if (startOffset > lastEndOffset)
|
||||
newText.append(text.substring(lastEndOffset, startOffset));
|
||||
newText.append(markedUpText);
|
||||
lastEndOffset=endOffset;
|
||||
}
|
||||
|
||||
// append text after end of last token
|
||||
if (lastEndOffset < text.length())
|
||||
|
|
|
@ -24,6 +24,7 @@ public class SimpleHTMLFormatter implements Formatter
|
|||
{
|
||||
String preTag;
|
||||
String postTag;
|
||||
|
||||
|
||||
public SimpleHTMLFormatter(String preTag, String postTag)
|
||||
{
|
||||
|
@ -41,17 +42,20 @@ public class SimpleHTMLFormatter implements Formatter
|
|||
this.postTag = "</B>";
|
||||
}
|
||||
|
||||
public String highlightTerm(String originalText, String term, float score, int startOffset)
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.search.highlight.Formatter#highlightTerm(java.lang.String, org.apache.lucene.search.highlight.TokenGroup)
|
||||
*/
|
||||
public String highlightTerm(String originalText, TokenGroup tokenGroup)
|
||||
{
|
||||
if(score<=0)
|
||||
StringBuffer returnBuffer;
|
||||
if(tokenGroup.getTotalScore()>0)
|
||||
{
|
||||
return originalText;
|
||||
returnBuffer=new StringBuffer();
|
||||
returnBuffer.append(preTag);
|
||||
returnBuffer.append(originalText);
|
||||
returnBuffer.append(postTag);
|
||||
return returnBuffer.toString();
|
||||
}
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append(preTag);
|
||||
sb.append(originalText);
|
||||
sb.append(postTag);
|
||||
return sb.toString();
|
||||
return originalText;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,120 @@
|
|||
package org.apache.lucene.search.highlight;
|
||||
/**
|
||||
* Copyright 2002-2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
/**
|
||||
* One, or several overlapping tokens, along with the score(s) and the
|
||||
* scope of the original text
|
||||
* @author MAHarwood
|
||||
*/
|
||||
public class TokenGroup
|
||||
{
|
||||
|
||||
private static final int MAX_NUM_TOKENS_PER_GROUP=50;
|
||||
Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
|
||||
float [] scores=new float[MAX_NUM_TOKENS_PER_GROUP];
|
||||
int numTokens=0;
|
||||
int startOffset=0;
|
||||
int endOffset=0;
|
||||
|
||||
|
||||
void addToken(Token token, float score)
|
||||
{
|
||||
if(numTokens==0)
|
||||
{
|
||||
startOffset=token.startOffset();
|
||||
endOffset=token.endOffset();
|
||||
}
|
||||
else
|
||||
{
|
||||
startOffset=Math.min(startOffset,token.startOffset());
|
||||
endOffset=Math.max(endOffset,token.endOffset());
|
||||
}
|
||||
tokens[numTokens]=token;
|
||||
scores[numTokens]=score;
|
||||
numTokens++;
|
||||
}
|
||||
|
||||
boolean isDistinct(Token token)
|
||||
{
|
||||
return token.startOffset()>endOffset;
|
||||
}
|
||||
|
||||
|
||||
void clear()
|
||||
{
|
||||
numTokens=0;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param index a value between 0 and numTokens -1
|
||||
* @return the "n"th token
|
||||
*/
|
||||
public Token getToken(int index)
|
||||
{
|
||||
return tokens[index];
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param index a value between 0 and numTokens -1
|
||||
* @return the "n"th score
|
||||
*/
|
||||
public float getScore(int index)
|
||||
{
|
||||
return scores[index];
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the end position in the original text
|
||||
*/
|
||||
public int getEndOffset()
|
||||
{
|
||||
return endOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of tokens in this group
|
||||
*/
|
||||
public int getNumTokens()
|
||||
{
|
||||
return numTokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the start position in the original text
|
||||
*/
|
||||
public int getStartOffset()
|
||||
{
|
||||
return startOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return
|
||||
*/
|
||||
public float getTotalScore()
|
||||
{
|
||||
float total=0;
|
||||
for (int i = 0; i < numTokens; i++)
|
||||
{
|
||||
total+=scores[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,4 +1,5 @@
|
|||
package org.apache.lucene.search.highlight;
|
||||
|
||||
/**
|
||||
* Copyright 2002-2004 The Apache Software Foundation
|
||||
*
|
||||
|
@ -16,13 +17,18 @@ package org.apache.lucene.search.highlight;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LowerCaseTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
//import org.apache.lucene.analysis.cjk.CJKAnalyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
@ -35,6 +41,12 @@ import org.apache.lucene.search.IndexSearcher;
|
|||
import org.apache.lucene.search.MultiSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Searcher;
|
||||
import org.apache.lucene.search.highlight.Formatter;
|
||||
import org.apache.lucene.search.highlight.Highlighter;
|
||||
import org.apache.lucene.search.highlight.QueryScorer;
|
||||
import org.apache.lucene.search.highlight.SimpleFragmenter;
|
||||
import org.apache.lucene.search.highlight.TokenGroup;
|
||||
import org.apache.lucene.search.highlight.WeightedTerm;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
|
||||
/**
|
||||
|
@ -60,23 +72,31 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
"John Kennedy has been shot",
|
||||
"This text has a typo in referring to Keneddy" };
|
||||
|
||||
/**
|
||||
* Constructor for HighlightExtractorTest.
|
||||
* @param arg0
|
||||
*/
|
||||
public HighlighterTest(String arg0)
|
||||
{
|
||||
super(arg0);
|
||||
}
|
||||
|
||||
public void testSimpleHighlighter() throws Exception
|
||||
{
|
||||
doSearching("Kennedy");
|
||||
Highlighter highlighter = new Highlighter(new QueryScorer(query));
|
||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||
int maxNumFragmentsRequired = 2;
|
||||
for (int i = 0; i < hits.length(); i++)
|
||||
{
|
||||
String text = hits.doc(i).get(FIELD_NAME);
|
||||
TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
|
||||
|
||||
|
||||
String result =
|
||||
highlighter.getBestFragments(tokenStream,text,maxNumFragmentsRequired, "...");
|
||||
System.out.println("\t" + result);
|
||||
}
|
||||
//Not sure we can assert anything here - just running to check we dont throw any exceptions
|
||||
//Not sure we can assert anything here - just running to check we dont throw any exceptions
|
||||
}
|
||||
|
||||
|
||||
|
@ -151,7 +171,7 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
}
|
||||
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
|
||||
}
|
||||
|
||||
|
||||
public void testGetBestSingleFragmentWithWeights() throws Exception
|
||||
{
|
||||
WeightedTerm[]wTerms=new WeightedTerm[2];
|
||||
|
@ -160,9 +180,9 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
Highlighter highlighter =new Highlighter(new QueryScorer(wTerms));
|
||||
TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0]));
|
||||
highlighter.setTextFragmenter(new SimpleFragmenter(2));
|
||||
|
||||
|
||||
String result = highlighter.getBestFragment(tokenStream,texts[0]).trim();
|
||||
assertTrue("Failed to find best section using weighted terms. Found: "+result
|
||||
assertTrue("Failed to find best section using weighted terms. Found: ["+result+"]"
|
||||
, "<B>Hello</B>".equals(result));
|
||||
|
||||
//readjust weights
|
||||
|
@ -170,14 +190,36 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0]));
|
||||
highlighter =new Highlighter(new QueryScorer(wTerms));
|
||||
highlighter.setTextFragmenter(new SimpleFragmenter(2));
|
||||
|
||||
|
||||
result = highlighter.getBestFragment(tokenStream,texts[0]).trim();
|
||||
assertTrue("Failed to find best section using weighted terms. Found: "+result
|
||||
, "<B>kennedy</B>".equals(result));
|
||||
}
|
||||
|
||||
|
||||
|
||||
// tests a "complex" analyzer that produces multiple
|
||||
// overlapping tokens
|
||||
public void testOverlapAnalyzer() throws Exception
|
||||
{
|
||||
HashMap synonyms = new HashMap();
|
||||
synonyms.put("football", "soccer,footie");
|
||||
Analyzer analyzer = new SynonymAnalyzer(synonyms);
|
||||
String srchkey = "football";
|
||||
|
||||
String s = "football-soccer in the euro 2004 footie competition";
|
||||
Query query = QueryParser.parse(srchkey, "bookid", analyzer);
|
||||
|
||||
Highlighter highlighter = new Highlighter(new QueryScorer(query));
|
||||
TokenStream tokenStream =
|
||||
analyzer.tokenStream(null, new StringReader(s));
|
||||
// Get 3 best fragments and seperate with a "..."
|
||||
String result = highlighter.getBestFragments(tokenStream, s, 3, "...");
|
||||
String expectedResult="<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
|
||||
assertTrue("overlapping analyzer should handle highlights OK",expectedResult.equals(result));
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void testGetSimpleHighlight() throws Exception
|
||||
{
|
||||
doSearching("Kennedy");
|
||||
|
@ -188,7 +230,7 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
{
|
||||
String text = hits.doc(i).get(FIELD_NAME);
|
||||
TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
|
||||
|
||||
|
||||
String result = highlighter.getBestFragment(tokenStream,text);
|
||||
System.out.println("\t" + result);
|
||||
}
|
||||
|
@ -209,7 +251,7 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public void testUnRewrittenQuery() throws IOException, ParseException
|
||||
{
|
||||
//test to show how rewritten query can still be used
|
||||
|
@ -226,7 +268,7 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
Highlighter highlighter =
|
||||
new Highlighter(this,new QueryScorer(query));
|
||||
|
||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||
|
||||
int maxNumFragmentsRequired = 3;
|
||||
|
||||
|
@ -234,14 +276,14 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
{
|
||||
String text = hits.doc(i).get(FIELD_NAME);
|
||||
TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
|
||||
|
||||
|
||||
String highlightedText = highlighter.getBestFragments(tokenStream,text,maxNumFragmentsRequired,"...");
|
||||
System.out.println(highlightedText);
|
||||
}
|
||||
//We expect to have zero highlights if the query is multi-terms and is not rewritten!
|
||||
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 0);
|
||||
}
|
||||
|
||||
|
||||
public void testNoFragments() throws Exception
|
||||
{
|
||||
doSearching("AnInvalidQueryWhichShouldYieldNoResults");
|
||||
|
@ -253,12 +295,12 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
{
|
||||
String text = texts[i];
|
||||
TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
|
||||
|
||||
|
||||
String result = highlighter.getBestFragment(tokenStream,text);
|
||||
assertNull("The highlight result should be null for text with no query terms", result);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void testMultiSearcher() throws Exception
|
||||
{
|
||||
//setup index 1
|
||||
|
@ -266,7 +308,7 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true);
|
||||
Document d = new Document();
|
||||
Field f = new Field(FIELD_NAME, "multiOne", true, true, true);
|
||||
d.add(f);
|
||||
d.add(f);
|
||||
writer1.addDocument(d);
|
||||
writer1.optimize();
|
||||
writer1.close();
|
||||
|
@ -277,15 +319,15 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true);
|
||||
d = new Document();
|
||||
f = new Field(FIELD_NAME, "multiTwo", true, true, true);
|
||||
d.add(f);
|
||||
d.add(f);
|
||||
writer2.addDocument(d);
|
||||
writer2.optimize();
|
||||
writer2.close();
|
||||
IndexReader reader2 = IndexReader.open(ramDir2);
|
||||
|
||||
|
||||
|
||||
IndexSearcher searchers[]=new IndexSearcher[2];
|
||||
|
||||
IndexSearcher searchers[]=new IndexSearcher[2];
|
||||
searchers[0] = new IndexSearcher(ramDir1);
|
||||
searchers[1] = new IndexSearcher(ramDir2);
|
||||
MultiSearcher multiSearcher=new MultiSearcher(searchers);
|
||||
|
@ -299,8 +341,8 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
expandedQueries[0]=query.rewrite(reader1);
|
||||
expandedQueries[1]=query.rewrite(reader2);
|
||||
query=query.combine(expandedQueries);
|
||||
|
||||
|
||||
|
||||
|
||||
//create an instance of the highlighter with the tags used to surround highlighted text
|
||||
Highlighter highlighter =
|
||||
new Highlighter(this,new QueryScorer(query));
|
||||
|
@ -312,13 +354,13 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
String highlightedText = highlighter.getBestFragment(tokenStream,text);
|
||||
System.out.println(highlightedText);
|
||||
}
|
||||
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
|
||||
|
||||
|
||||
|
||||
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
/*
|
||||
|
||||
public void testBigramAnalyzer() throws IOException, ParseException
|
||||
{
|
||||
|
@ -331,11 +373,11 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
Document d = new Document();
|
||||
Field f = new Field(FIELD_NAME, "java abc def", true, true, true);
|
||||
d.add(f);
|
||||
writer.addDocument(d);
|
||||
writer.addDocument(d);
|
||||
writer.close();
|
||||
IndexReader reader = IndexReader.open(ramDir);
|
||||
|
||||
IndexSearcher searcher=new IndexSearcher(reader);
|
||||
IndexSearcher searcher=new IndexSearcher(reader);
|
||||
query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);
|
||||
System.out.println("Searching for: " + query.toString(FIELD_NAME));
|
||||
hits = searcher.search(query);
|
||||
|
@ -349,15 +391,15 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
TokenStream tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text));
|
||||
String highlightedText = highlighter.getBestFragment(tokenStream,text);
|
||||
System.out.println(highlightedText);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
*/
|
||||
*/
|
||||
|
||||
|
||||
public String highlightTerm(String originalText , String weightedTerm, float score, int startOffset)
|
||||
public String highlightTerm(String originalText , TokenGroup group)
|
||||
{
|
||||
if(score<=0)
|
||||
if(group.getTotalScore()<=0)
|
||||
{
|
||||
return originalText;
|
||||
}
|
||||
|
@ -369,7 +411,7 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
{
|
||||
searcher = new IndexSearcher(ramDir);
|
||||
query = QueryParser.parse(queryString, FIELD_NAME, new StandardAnalyzer());
|
||||
//for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query!
|
||||
//for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query!
|
||||
query=query.rewrite(reader);
|
||||
System.out.println("Searching for: " + query.toString(FIELD_NAME));
|
||||
hits = searcher.search(query);
|
||||
|
@ -385,7 +427,7 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
int maxNumFragmentsRequired = 2;
|
||||
String fragmentSeparator = "...";
|
||||
TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
|
||||
|
||||
|
||||
String result =
|
||||
highlighter.getBestFragments(
|
||||
tokenStream,
|
||||
|
@ -432,3 +474,91 @@ public class HighlighterTest extends TestCase implements Formatter
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
//===================================================================
|
||||
//========== BEGIN TEST SUPPORTING CLASSES
|
||||
//========== THESE LOOK LIKE, WITH SOME MORE EFFORT THESE COULD BE
|
||||
//========== MADE MORE GENERALLY USEFUL.
|
||||
// TODO - make synonyms all interchangeable with each other and produce
|
||||
// a version that does antonyms(?) - the "is a specialised type of ...."
|
||||
// so that car=audi, bmw and volkswagen but bmw != audi so different
|
||||
// behaviour to synonyms
|
||||
//===================================================================
|
||||
|
||||
class SynonymAnalyzer extends Analyzer
|
||||
{
|
||||
private Map synonyms;
|
||||
|
||||
public SynonymAnalyzer(Map synonyms)
|
||||
{
|
||||
this.synonyms = synonyms;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
|
||||
*/
|
||||
public TokenStream tokenStream(String arg0, Reader arg1)
|
||||
{
|
||||
|
||||
return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer)
|
||||
* @author MAHarwood
|
||||
*/
|
||||
class SynonymTokenizer extends TokenStream
|
||||
{
|
||||
private TokenStream realStream;
|
||||
private Token currentRealToken = null;
|
||||
private Map synonyms;
|
||||
StringTokenizer st = null;
|
||||
public SynonymTokenizer(TokenStream realStream, Map synonyms)
|
||||
{
|
||||
this.realStream = realStream;
|
||||
this.synonyms = synonyms;
|
||||
}
|
||||
public Token next() throws IOException
|
||||
{
|
||||
if (currentRealToken == null)
|
||||
{
|
||||
Token nextRealToken = realStream.next();
|
||||
if (nextRealToken == null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
String expansions = (String) synonyms.get(nextRealToken.termText());
|
||||
if (expansions == null)
|
||||
{
|
||||
return nextRealToken;
|
||||
}
|
||||
st = new StringTokenizer(expansions, ",");
|
||||
if (st.hasMoreTokens())
|
||||
{
|
||||
currentRealToken = nextRealToken;
|
||||
}
|
||||
return currentRealToken;
|
||||
}
|
||||
else
|
||||
{
|
||||
String nextExpandedValue = st.nextToken();
|
||||
Token expandedToken =
|
||||
new Token(
|
||||
nextExpandedValue,
|
||||
currentRealToken.startOffset(),
|
||||
currentRealToken.endOffset());
|
||||
expandedToken.setPositionIncrement(0);
|
||||
if (!st.hasMoreTokens())
|
||||
{
|
||||
currentRealToken = null;
|
||||
st = null;
|
||||
}
|
||||
return expandedToken;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue