Initial commit of Mark Harwood's Highlighter package

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150972 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2004-04-09 00:34:31 +00:00
parent 45898ec436
commit 33345f7af9
13 changed files with 1539 additions and 0 deletions

View File

@ -0,0 +1,10 @@
<?xml version="1.0"?>
<!-- Ant build file for the "highlighter" contrib module.
     All real targets (including "default") are inherited from ../common.xml. -->
<project name="highlighter" default="default">
<description>
Hits highlighter
</description>
<import file="../common.xml"/>
</project>

View File

@ -0,0 +1,38 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Processes terms found in the original text, typically by applying some form
* of mark-up to highlight terms in HTML search results pages.
*
*/
public interface Formatter
{
	/**
	 * Highlights a search term. For example, an HTML Formatter could simply do:
	 *
	 * <p><dl><dt></dt><dd><code>return "&lt;b&gt;" + term + "&lt;/b&gt;";</code></dd></dl>
	 *
	 * @param originalTermText (unstemmed) term text to highlight
	 * @param stemmedTerm the stemmed form of the originalTermText
	 * @param score the score for this term supplied by the Scorer; implementations
	 *              typically leave the text unmarked when it is not positive
	 *              (NOTE(review): exact score semantics depend on the Scorer in use — confirm)
	 * @param startOffset the position of the originalTermText in the text being highlighted
	 *
	 * @return highlighted term text
	 */
	String highlightTerm(String originalTermText, String stemmedTerm, float score, int startOffset);
}

View File

@ -0,0 +1,40 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
/**
* Implements the policy for breaking text into multiple fragments for consideration
* by the {@link Highlighter} class. A sophisticated implementation may do this on the basis
* of detecting end of sentences in the text.
* @author mark@searcharea.co.uk
*/
public interface Fragmenter
{
	/**
	 * Initializes the Fragmenter. Called once, with the full text, before any
	 * tokens are offered to {@link #isNewFragment(Token)}.
	 * @param originalText the complete original text being fragmented
	 */
	public void start(String originalText);

	/**
	 * Test to see if this token from the stream should be held in a new TextFragment
	 * @param nextToken the next token produced by the token stream
	 * @return <code>true</code> if a new fragment should begin at this token
	 */
	public boolean isNewFragment(Token nextToken);
}

View File

@ -0,0 +1,430 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue;
/**
* Class used to markup highlighted terms found in the best sections of a
* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}
* and tokenizers.
* @author mark@searcharea.co.uk
*/
public class Highlighter
{
	/** Default cap on the number of bytes of a document that will be tokenized. */
	public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE = 50 * 1024;

	private int maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
	private Formatter formatter;
	private Fragmenter textFragmenter = new SimpleFragmenter();
	private Scorer fragmentScorer = null;

	/**
	 * Creates a highlighter which marks up hits using the default
	 * {@link SimpleHTMLFormatter}.
	 *
	 * @param fragmentScorer used to score each candidate text fragment
	 */
	public Highlighter(Scorer fragmentScorer)
	{
		this(new SimpleHTMLFormatter(), fragmentScorer);
	}

	/**
	 * @param formatter used to mark up each highlighted term occurrence
	 * @param fragmentScorer used to score each candidate text fragment
	 */
	public Highlighter(Formatter formatter, Scorer fragmentScorer)
	{
		this.formatter = formatter;
		this.fragmentScorer = fragmentScorer;
	}

	/**
	 * Highlights chosen terms in a text, extracting the most relevant section.
	 * The document text is analysed in chunks to record hit statistics
	 * across the document. After accumulating stats, the fragment with the highest score
	 * is returned.
	 *
	 * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
	 *        This is typically produced by an analyzer re-parsing a document's
	 *        text. Some work may be done on retrieving TokenStreams more efficiently
	 *        by adding support for storing original text position data in the Lucene
	 *        index but this support is not currently available (as of Lucene 1.4 rc2).
	 * @param text text to highlight terms in
	 *
	 * @return highlighted text fragment or null if no terms found
	 * @throws IOException if the token stream cannot be read
	 */
	public final String getBestFragment(TokenStream tokenStream, String text)
		throws IOException
	{
		String[] results = getBestFragments(tokenStream, text, 1);
		if (results.length > 0)
		{
			return results[0];
		}
		return null;
	}

	/**
	 * Highlights chosen terms in a text, extracting the most relevant sections.
	 * The document text is analysed in chunks to record hit statistics
	 * across the document. After accumulating stats, the fragments with the highest scores
	 * are returned as an array of strings in order of score (contiguous fragments are merged into
	 * one in their original order to improve readability)
	 *
	 * @param tokenStream a stream of tokens identified in the text parameter, including offset information
	 * @param text text to highlight terms in
	 * @param maxNumFragments the maximum number of fragments (values below 1 are treated as 1)
	 *
	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
	 * @throws IOException if the token stream cannot be read
	 */
	public final String[] getBestFragments(
		TokenStream tokenStream,
		String text,
		int maxNumFragments)
		throws IOException
	{
		maxNumFragments = Math.max(1, maxNumFragments); //sanity check

		StringBuffer newText = new StringBuffer();
		TextFragment[] frag = getBestDocFragments(tokenStream, text, newText, maxNumFragments);
		mergeContiguousFragments(frag);

		//collect the marked-up text of the surviving, positively-scored fragments
		ArrayList fragTexts = new ArrayList();
		for (int i = 0; i < frag.length; i++)
		{
			if ((frag[i] != null) && (frag[i].getScore() > 0))
			{
				fragTexts.add(
					newText.substring(
						frag[i].textStartPos,
						frag[i].textEndPos));
			}
		}
		return (String[]) fragTexts.toArray(new String[0]);
	}

	/**
	 * Low level api to get the most relevant (highest-scoring) sections of the document.
	 * Walks the token stream, appending a marked-up copy of the text to newText,
	 * breaking it into fragments via the configured {@link Fragmenter} and scoring
	 * each fragment via the configured {@link Scorer}.
	 *
	 * @param tokenStream token stream for the text; always closed before returning
	 * @param text the original document text
	 * @param newText buffer that receives the marked-up copy of the text
	 * @param maxNumFragments maximum number of fragments to return
	 * @return fragments in descending score order
	 * @throws IOException if the token stream cannot be read
	 */
	private final TextFragment[] getBestDocFragments(
		TokenStream tokenStream,
		String text,
		StringBuffer newText,
		int maxNumFragments)
		throws IOException
	{
		ArrayList docFrags = new ArrayList();
		TextFragment currentFrag = new TextFragment(newText.length(), docFrags.size());
		fragmentScorer.startFragment(currentFrag);
		docFrags.add(currentFrag);

		FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
		try
		{
			org.apache.lucene.analysis.Token token;
			String tokenText;
			int startOffset;
			int endOffset;
			int lastEndOffset = 0;
			textFragmenter.start(text);

			while ((token = tokenStream.next()) != null)
			{
				startOffset = token.startOffset();
				endOffset = token.endOffset();
				//FIXME an issue was reported with CJKTokenizer that I couldnt reproduce
				// where the analyzer was producing overlapping tokens.
				// I suspect the fix is to make startOffset=Math.max(startOffset,lastEndOffset+1)
				// but cant be sure so I'll just leave this comment in for now
				tokenText = text.substring(startOffset, endOffset);

				// append text between end of last token (or beginning of text) and start of current token
				if (startOffset > lastEndOffset)
				{
					newText.append(text.substring(lastEndOffset, startOffset));
				}

				// mark up the token; the scorer decides whether it counts as a query hit
				float score = fragmentScorer.getTokenScore(token);
				newText.append(formatter.highlightTerm(tokenText, token.termText(), score, startOffset));

				if (textFragmenter.isNewFragment(token))
				{
					currentFrag.setScore(fragmentScorer.getFragmentScore());
					//record stats for a new fragment
					currentFrag.textEndPos = newText.length();
					currentFrag = new TextFragment(newText.length(), docFrags.size());
					fragmentScorer.startFragment(currentFrag);
					docFrags.add(currentFrag);
				}

				lastEndOffset = endOffset;
				//stop analysing once the configured byte limit has been passed
				if (lastEndOffset > maxDocBytesToAnalyze)
				{
					break;
				}
			}
			currentFrag.setScore(fragmentScorer.getFragmentScore());

			// append text after end of last token
			if (lastEndOffset < text.length())
			{
				newText.append(text.substring(lastEndOffset));
			}
			currentFrag.textEndPos = newText.length();

			//rank the fragments. NOTE: PriorityQueue.insert() requires a Lucene build
			//from 11th Sept 03 or later (Christoph Goller's PriorityQueue fix).
			for (Iterator i = docFrags.iterator(); i.hasNext();)
			{
				currentFrag = (TextFragment) i.next();
				fragQueue.insert(currentFrag);
			}

			//return the most relevant fragments, highest score first
			TextFragment frag[] = new TextFragment[fragQueue.size()];
			for (int i = frag.length - 1; i >= 0; i--)
			{
				frag[i] = (TextFragment) fragQueue.pop();
			}
			return frag;
		}
		finally
		{
			if (tokenStream != null)
			{
				try
				{
					tokenStream.close();
				}
				catch (Exception ignored)
				{
					//best-effort close: a close failure must not mask the real result or exception
				}
			}
		}
	}

	/** Improves readability of a score-sorted list of TextFragments by merging any fragments
	 * that were contiguous in the original text into one larger fragment with the correct order.
	 * This will leave a "null" in the array entry for the lesser scored fragment.
	 *
	 * @param frag An array of document fragments in descending score
	 */
	private void mergeContiguousFragments(TextFragment[] frag)
	{
		boolean mergingStillBeingDone;
		if (frag.length > 1)
			do
			{
				mergingStillBeingDone = false; //initialise loop control flag
				//for each fragment, scan other frags looking for contiguous blocks
				for (int i = 0; i < frag.length; i++)
				{
					if (frag[i] == null)
					{
						continue;
					}
					//merge any contiguous blocks
					for (int x = 0; x < frag.length; x++)
					{
						if (frag[x] == null)
						{
							continue;
						}
						if (frag[i] == null)
						{
							//frag[i] may have been nulled by an earlier merge in this inner scan
							break;
						}
						TextFragment frag1 = null;
						TextFragment frag2 = null;
						int frag1Num = 0;
						int frag2Num = 0;
						int bestScoringFragNum;
						int worstScoringFragNum;
						//if blocks are contiguous....
						if (frag[i].follows(frag[x]))
						{
							frag1 = frag[x];
							frag1Num = x;
							frag2 = frag[i];
							frag2Num = i;
						}
						else if (frag[x].follows(frag[i]))
						{
							frag1 = frag[i];
							frag1Num = i;
							frag2 = frag[x];
							frag2Num = x;
						}
						//merging required..
						if (frag1 != null)
						{
							if (frag1.getScore() > frag2.getScore())
							{
								bestScoringFragNum = frag1Num;
								worstScoringFragNum = frag2Num;
							}
							else
							{
								bestScoringFragNum = frag2Num;
								worstScoringFragNum = frag1Num;
							}
							//merge into frag1 (the earlier fragment) and keep it under the
							//better-scoring slot so ranking order is preserved
							frag1.merge(frag2);
							frag[worstScoringFragNum] = null;
							mergingStillBeingDone = true;
							frag[bestScoringFragNum] = frag1;
						}
					}
				}
			}
			while (mergingStillBeingDone);
	}

	/**
	 * Highlights terms in the text , extracting the most relevant sections
	 * and concatenating the chosen fragments with a separator (typically "...").
	 * The document text is analysed in chunks to record hit statistics
	 * across the document. After accumulating stats, the fragments with the highest scores
	 * are returned in order as "separator" delimited strings.
	 *
	 * @param tokenStream a stream of tokens identified in the text parameter
	 * @param text text to highlight terms in
	 * @param maxNumFragments the maximum number of fragments.
	 * @param separator the separator used to intersperse the document fragments (typically "...")
	 *
	 * @return highlighted text
	 * @throws IOException if the token stream cannot be read
	 */
	public final String getBestFragments(
		TokenStream tokenStream,
		String text,
		int maxNumFragments,
		String separator)
		throws IOException
	{
		String sections[] = getBestFragments(tokenStream, text, maxNumFragments);
		StringBuffer result = new StringBuffer();
		for (int i = 0; i < sections.length; i++)
		{
			if (i > 0)
			{
				result.append(separator);
			}
			result.append(sections[i]);
		}
		return result.toString();
	}

	/**
	 * @return the maximum number of bytes to be tokenized per doc
	 */
	public int getMaxDocBytesToAnalyze()
	{
		return maxDocBytesToAnalyze;
	}

	/**
	 * @param byteCount the maximum number of bytes to be tokenized per doc
	 * (This can improve performance with large documents)
	 */
	public void setMaxDocBytesToAnalyze(int byteCount)
	{
		maxDocBytesToAnalyze = byteCount;
	}

	/**
	 * @return the {@link Fragmenter} used to break the text into candidate fragments
	 */
	public Fragmenter getTextFragmenter()
	{
		return textFragmenter;
	}

	/**
	 * @param fragmenter the {@link Fragmenter} used to break the text into candidate fragments
	 */
	public void setTextFragmenter(Fragmenter fragmenter)
	{
		textFragmenter = fragmenter;
	}

	/**
	 * @return Object used to score each text fragment
	 */
	public Scorer getFragmentScorer()
	{
		return fragmentScorer;
	}

	/**
	 * @param scorer Object used to score each text fragment
	 */
	public void setFragmentScorer(Scorer scorer)
	{
		fragmentScorer = scorer;
	}
}
/**
 * Ranks TextFragments: a higher score wins; on equal scores the fragment that
 * occurs earlier in the document (lower fragNum) is considered "greater".
 */
class FragmentQueue extends PriorityQueue
{
	public FragmentQueue(int size)
	{
		initialize(size);
	}

	public final boolean lessThan(Object a, Object b)
	{
		TextFragment first = (TextFragment) a;
		TextFragment second = (TextFragment) b;
		if (first.getScore() == second.getScore())
		{
			//tie-break: later fragments rank lower
			return first.fragNum > second.fragNum;
		}
		return first.getScore() < second.getScore();
	}
}

View File

@ -0,0 +1,442 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
//import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.RAMDirectory;
/**
* JUnit Test for Highlighter class.
* @author mark@searcharea.co.uk
*/
public class HighlighterTest extends TestCase implements Formatter
{
	private IndexReader reader;
	private static final String FIELD_NAME = "contents";
	private Query query;
	RAMDirectory ramDir;
	public Searcher searcher = null;
	public Hits hits = null;
	// count of positively-scored highlightTerm() callbacks; the assertions below rely on it
	int numHighlights = 0;
	Analyzer analyzer = new StandardAnalyzer();

	String texts[] =
		{
			"Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
			"This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
			"JFK has been shot",
			"John Kennedy has been shot",
			"This text has a typo in referring to Keneddy" };

	/**
	 * Constructor for HighlightExtractorTest.
	 * @param arg0
	 */
	public HighlighterTest(String arg0)
	{
		super(arg0);
	}

	/** Smoke test using the default HTML formatter - only checks no exception is thrown. */
	public void testSimpleHighlighter() throws Exception
	{
		doSearching("Kennedy");
		Highlighter highlighter = new Highlighter(new QueryScorer(query));
		highlighter.setTextFragmenter(new SimpleFragmenter(40));
		int maxNumFragmentsRequired = 2;
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String result =
				highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
			System.out.println("\t" + result);
		}
		//Not sure we can assert anything here - just running to check we dont throw any exceptions
	}

	public void testGetBestFragmentsSimpleQuery() throws Exception
	{
		doSearching("Kennedy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	}

	public void testGetFuzzyFragments() throws Exception
	{
		doSearching("Kinnedy~");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	}

	public void testGetWildCardFragments() throws Exception
	{
		doSearching("K?nnedy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	}

	public void testGetMidWildCardFragments() throws Exception
	{
		doSearching("K*dy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	}

	public void testGetRangeFragments() throws Exception
	{
		doSearching(FIELD_NAME + ":[kannedy TO kznnedy]"); //bug?needs lower case
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	}

	public void testGetBestFragmentsPhrase() throws Exception
	{
		doSearching("\"John Kennedy\"");
		doStandardHighlights();
		//Currently highlights "John" and "Kennedy" separately
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
	}

	public void testGetBestFragmentsMultiTerm() throws Exception
	{
		doSearching("John Kenn*");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	}

	public void testGetBestFragmentsWithOr() throws Exception
	{
		doSearching("JFK OR Kennedy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	}

	public void testGetBestSingleFragment() throws Exception
	{
		doSearching("Kennedy");
		Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
		highlighter.setTextFragmenter(new SimpleFragmenter(40));
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String result = highlighter.getBestFragment(tokenStream, text);
			System.out.println("\t" + result);
		}
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	}

	/** Checks that term weights steer fragment selection: the heavier term's fragment wins. */
	public void testGetBestSingleFragmentWithWeights() throws Exception
	{
		WeightedTerm[] wTerms = new WeightedTerm[2];
		wTerms[0] = new WeightedTerm(10f, "hello");
		wTerms[1] = new WeightedTerm(1f, "kennedy");
		Highlighter highlighter = new Highlighter(new QueryScorer(wTerms));
		TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
		highlighter.setTextFragmenter(new SimpleFragmenter(2));
		String result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
		assertTrue("Failed to find best section using weighted terms. Found: " + result
			, "<B>Hello</B>".equals(result));

		//readjust weights so "kennedy" now dominates
		wTerms[1].setWeight(50f);
		tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
		highlighter = new Highlighter(new QueryScorer(wTerms));
		highlighter.setTextFragmenter(new SimpleFragmenter(2));
		result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
		assertTrue("Failed to find best section using weighted terms. Found: " + result
			, "<B>kennedy</B>".equals(result));
	}

	public void testGetSimpleHighlight() throws Exception
	{
		doSearching("Kennedy");
		Highlighter highlighter =
			new Highlighter(this, new QueryScorer(query));
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String result = highlighter.getBestFragment(tokenStream, text);
			System.out.println("\t" + result);
		}
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	}

	/** "meat" appears after byte 30, so capping analysis at 30 bytes must yield no highlights. */
	public void testMaxSizeHighlight() throws Exception
	{
		doSearching("meat");
		Highlighter highlighter =
			new Highlighter(this, new QueryScorer(query));
		highlighter.setMaxDocBytesToAnalyze(30);
		TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
		String result = highlighter.getBestFragment(tokenStream, texts[0]);
		assertTrue("Setting MaxDocBytesToAnalyze should have prevented " +
			"us from finding matches for this record" + numHighlights +
			" found", numHighlights == 0);
	}

	/** Shows that a multi-term query which is NOT rewritten produces zero highlights. */
	public void testUnRewrittenQuery() throws IOException, ParseException
	{
		//test to show how rewritten query can still be used
		searcher = new IndexSearcher(ramDir);
		Analyzer analyzer = new StandardAnalyzer();
		Query query = QueryParser.parse("JF? or Kenned*", FIELD_NAME, analyzer);
		System.out.println("Searching with primitive query");
		//forget to set this and...
		//query=query.rewrite(reader);
		Hits hits = searcher.search(query);

		//create an instance of the highlighter with the tags used to surround highlighted text
		Highlighter highlighter =
			new Highlighter(this, new QueryScorer(query));
		highlighter.setTextFragmenter(new SimpleFragmenter(40));
		int maxNumFragmentsRequired = 3;
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String highlightedText = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
			System.out.println(highlightedText);
		}
		//We expect to have zero highlights if the query is multi-terms and is not rewritten!
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 0);
	}

	/** A query matching nothing must make getBestFragment return null for every text. */
	public void testNoFragments() throws Exception
	{
		doSearching("AnInvalidQueryWhichShouldYieldNoResults");
		Highlighter highlighter =
			new Highlighter(this, new QueryScorer(query));
		for (int i = 0; i < texts.length; i++)
		{
			String text = texts[i];
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String result = highlighter.getBestFragment(tokenStream, text);
			assertNull("The highlight result should be null for text with no query terms", result);
		}
	}

	/**
	 * Demonstrates highlighting hits from a MultiSearcher: the per-index rewritten
	 * queries must be combined before being handed to the highlighter.
	 */
	public void testMultiSearcher() throws Exception
	{
		//setup index 1
		RAMDirectory ramDir1 = new RAMDirectory();
		IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true);
		Document d = new Document();
		Field f = new Field(FIELD_NAME, "multiOne", true, true, true);
		d.add(f);
		writer1.addDocument(d);
		writer1.optimize();
		writer1.close();
		IndexReader reader1 = IndexReader.open(ramDir1);

		//setup index 2
		RAMDirectory ramDir2 = new RAMDirectory();
		IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true);
		d = new Document();
		f = new Field(FIELD_NAME, "multiTwo", true, true, true);
		d.add(f);
		writer2.addDocument(d);
		writer2.optimize();
		writer2.close();
		IndexReader reader2 = IndexReader.open(ramDir2);

		IndexSearcher searchers[] = new IndexSearcher[2];
		searchers[0] = new IndexSearcher(ramDir1);
		searchers[1] = new IndexSearcher(ramDir2);
		MultiSearcher multiSearcher = new MultiSearcher(searchers);
		query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer());
		System.out.println("Searching for: " + query.toString(FIELD_NAME));
		//at this point the multisearcher calls combine(query[])
		hits = multiSearcher.search(query);

		//rewrite the prefix query against each sub-index, then combine the expansions
		Query expandedQueries[] = new Query[2];
		expandedQueries[0] = query.rewrite(reader1);
		expandedQueries[1] = query.rewrite(reader2);
		query = query.combine(expandedQueries);

		//create an instance of the highlighter with the tags used to surround highlighted text
		Highlighter highlighter =
			new Highlighter(this, new QueryScorer(query));
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String highlightedText = highlighter.getBestFragment(tokenStream, text);
			System.out.println(highlightedText);
		}
		//close readers opened above (they were only needed for query.rewrite)
		reader1.close();
		reader2.close();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
	}

	/*
	public void testBigramAnalyzer() throws IOException, ParseException
	{
		//test to ensure analyzers with none-consecutive start/end offsets
		//dont double-highlight text
		//setup index 1
		RAMDirectory ramDir = new RAMDirectory();
		Analyzer bigramAnalyzer=new CJKAnalyzer();
		IndexWriter writer = new IndexWriter(ramDir,bigramAnalyzer , true);
		Document d = new Document();
		Field f = new Field(FIELD_NAME, "java abc def", true, true, true);
		d.add(f);
		writer.addDocument(d);
		writer.close();
		IndexReader reader = IndexReader.open(ramDir);

		IndexSearcher searcher=new IndexSearcher(reader);
		query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);
		System.out.println("Searching for: " + query.toString(FIELD_NAME));
		hits = searcher.search(query);

		Highlighter highlighter =
			new Highlighter(this,new QueryFragmentScorer(query));

		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text));
			String highlightedText = highlighter.getBestFragment(tokenStream,text);
			System.out.println(highlightedText);
		}
	}
	*/

	/**
	 * {@link Formatter} callback: wraps positively-scored terms in &lt;b&gt; tags
	 * and counts them so the tests above can assert on hit counts.
	 */
	public String highlightTerm(String originalText, String weightedTerm, float score, int startOffset)
	{
		if (score <= 0)
		{
			return originalText;
		}
		numHighlights++; //update stats used in assertions
		return "<b>" + originalText + "</b>";
	}

	/** Parses queryString, rewrites it against the index and stores query/hits in fields. */
	public void doSearching(String queryString) throws Exception
	{
		searcher = new IndexSearcher(ramDir);
		query = QueryParser.parse(queryString, FIELD_NAME, new StandardAnalyzer());
		//for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query!
		query = query.rewrite(reader);
		System.out.println("Searching for: " + query.toString(FIELD_NAME));
		hits = searcher.search(query);
	}

	/** Highlights every hit with a 20-char fragmenter; side effect: updates numHighlights. */
	void doStandardHighlights() throws Exception
	{
		Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
		highlighter.setTextFragmenter(new SimpleFragmenter(20));
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			int maxNumFragmentsRequired = 2;
			String fragmentSeparator = "...";
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String result =
				highlighter.getBestFragments(
					tokenStream,
					text,
					maxNumFragmentsRequired,
					fragmentSeparator);
			System.out.println("\t" + result);
		}
	}

	/*
	 * @see TestCase#setUp()
	 */
	protected void setUp() throws Exception
	{
		ramDir = new RAMDirectory();
		IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(), true);
		for (int i = 0; i < texts.length; i++)
		{
			addDoc(writer, texts[i]);
		}
		writer.optimize();
		writer.close();
		reader = IndexReader.open(ramDir);
		numHighlights = 0;
	}

	/** Adds a single stored+indexed+tokenized document containing text to the index. */
	private void addDoc(IndexWriter writer, String text) throws IOException
	{
		Document d = new Document();
		Field f = new Field(FIELD_NAME, text, true, true, true);
		d.add(f);
		writer.addDocument(d);
	}

	/*
	 * @see TestCase#tearDown()
	 */
	protected void tearDown() throws Exception
	{
		super.tearDown();
	}
}

View File

@ -0,0 +1,113 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.HashSet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.search.Query;
/**
* {@link Scorer} implementation which scores text fragments by the number of unique query terms found.
* This class uses the {@link QueryTermExtractor} class to process determine the query terms and
* their boosts to be used.
* @author mark@searcharea.co.uk
*/
//TODO: provide option to roll idf into the scoring equation by passing a IndexReader.
//TODO: provide option to boost score of fragments near beginning of document
// based on fragment.getFragNum()
public class QueryScorer implements Scorer
{
	TextFragment currentTextFragment = null;
	HashSet uniqueTermsInFragment;
	float totalScore = 0;
	private HashMap termsToFind;

	/**
	 *
	 * @param query a Lucene query (ideally rewritten using query.rewrite
	 * before being passed to this class and the searcher)
	 */
	public QueryScorer(Query query)
	{
		this(QueryTermExtractor.getTerms(query));
	}

	/**
	 * @param weightedTerms the terms (and their weights) to look for while scoring
	 */
	public QueryScorer(WeightedTerm[] weightedTerms)
	{
		termsToFind = new HashMap();
		for (int t = 0; t < weightedTerms.length; t++)
		{
			WeightedTerm wt = weightedTerms[t];
			termsToFind.put(wt.term, wt);
		}
	}

	/**
	 * Resets per-fragment state before the tokens of a new fragment are scored.
	 * @see org.apache.lucene.search.highlight.Scorer#startFragment
	 */
	public void startFragment(TextFragment newFragment)
	{
		uniqueTermsInFragment = new HashSet();
		currentTextFragment = newFragment;
		totalScore = 0;
	}

	/**
	 * Scores one token: returns the matching query term's weight, or zero when
	 * the token is not a query term. A term's weight is added to the fragment
	 * total only on its first occurrence within the current fragment.
	 * @see org.apache.lucene.search.highlight.Scorer#getTokenScore
	 */
	public float getTokenScore(Token token)
	{
		String termText = token.termText();
		WeightedTerm queryTerm = (WeightedTerm) termsToFind.get(termText);
		if (queryTerm == null)
		{
			//not a term from the query
			return 0;
		}
		//HashSet.add returns true only for the first occurrence in this fragment
		if (uniqueTermsInFragment.add(termText))
		{
			totalScore += queryTerm.getWeight();
		}
		return queryTerm.getWeight();
	}

	/**
	 * @return the accumulated score of the current fragment
	 * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore
	 */
	public float getFragmentScore()
	{
		return totalScore;
	}

	/**
	 * This implementation requires no work at end of processing.
	 * @see org.apache.lucene.search.highlight.Scorer#allFragmentsProcessed
	 */
	public void allFragmentsProcessed()
	{
	}
}

View File

@ -0,0 +1,115 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashSet;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
/**
 * Utility class used to extract the terms used in a query, plus any weights.
 * This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery classes
 * so the caller must pass a rewritten query (see Query.rewrite) to obtain a list of
 * expanded terms.
 */
public final class QueryTermExtractor
{
	/** All members are static - this utility class is not meant to be instantiated. */
	private QueryTermExtractor()
	{
	}

	/**
	 * Extracts all term texts of a given Query into an array of WeightedTerms,
	 * ignoring "prohibited" (NOT) clauses.
	 *
	 * @param query Query to extract term texts from
	 * @return an array of the terms used in a query, plus their weights.
	 */
	public static final WeightedTerm[] getTerms(Query query)
	{
		return getTerms(query, false);
	}

	/**
	 * Extracts all term texts of a given Query into an array of WeightedTerms.
	 *
	 * @param query Query to extract term texts from
	 * @param prohibited <code>true</code> to extract "prohibited" terms, too
	 * @return an array of the terms used in a query, plus their weights.
	 */
	public static final WeightedTerm[] getTerms(Query query, boolean prohibited)
	{
		HashSet terms = new HashSet();
		getTerms(query, terms, prohibited);
		return (WeightedTerm[]) terms.toArray(new WeightedTerm[0]);
	}

	/** Dispatches on the concrete query type; unrecognized query types contribute no terms. */
	private static final void getTerms(Query query, HashSet terms, boolean prohibited)
	{
		if (query instanceof BooleanQuery)
			getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited);
		else if (query instanceof PhraseQuery)
			getTermsFromPhraseQuery((PhraseQuery) query, terms);
		else if (query instanceof TermQuery)
			getTermsFromTermQuery((TermQuery) query, terms);
		// PrefixQuery, RangeQuery and MultiTermQuery are deliberately not handled here:
		// the client should call Query.rewrite() BEFORE calling the highlighter so that
		// such queries arrive already expanded into primitive Term/Boolean queries.
	}

	private static final void getTermsFromBooleanQuery(BooleanQuery query, HashSet terms, boolean prohibited)
	{
		BooleanClause[] queryClauses = query.getClauses();
		for (int i = 0; i < queryClauses.length; i++)
		{
			// skip NOT clauses unless the caller explicitly asked for them
			if (prohibited || !queryClauses[i].prohibited)
				getTerms(queryClauses[i].query, terms, prohibited);
		}
	}

	private static final void getTermsFromPhraseQuery(PhraseQuery query, HashSet terms)
	{
		Term[] queryTerms = query.getTerms();
		for (int i = 0; i < queryTerms.length; i++)
		{
			terms.add(new WeightedTerm(query.getBoost(), queryTerms[i].text()));
		}
	}

	private static final void getTermsFromTermQuery(TermQuery query, HashSet terms)
	{
		terms.add(new WeightedTerm(query.getBoost(), query.getTerm().text()));
	}
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
/**
 * Adds to the score for a fragment based on its tokens.
 * @author mark@searcharea.co.uk
 */
public interface Scorer
{
	/**
	 * Called when a new fragment is started for consideration.
	 * @param newFragment the fragment about to be scored
	 */
	public void startFragment(TextFragment newFragment);

	/**
	 * Called for each token in the current fragment.
	 * @param token the token to be scored
	 * @return a score which is passed to the highlighter to influence the mark-up of the text
	 * (this return value is NOT used to score the fragment)
	 */
	public float getTokenScore(Token token);

	/**
	 * Called when the highlighter has no more tokens for the current fragment.
	 * @return the total score for the fragment just processed (implementations
	 * typically also record it via setScore() on the fragment passed to startFragment)
	 */
	public float getFragmentScore();
}

View File

@ -0,0 +1,84 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
/**
 * {@link Fragmenter} implementation which breaks text up into same-size
 * fragments with no concerns over spotting sentence boundaries.
 * @author mark@searcharea.co.uk
 */
public class SimpleFragmenter implements Fragmenter
{
	private static final int DEFAULT_FRAGMENT_SIZE = 100;

	private int fragmentSize;    // target length (in characters) of each fragment
	private int currentNumFrags; // count of fragments started so far in this text

	/** Creates a fragmenter using the default fragment size of 100. */
	public SimpleFragmenter()
	{
		this(DEFAULT_FRAGMENT_SIZE);
	}

	/**
	 * @param fragmentSize size in characters of each fragment
	 */
	public SimpleFragmenter(int fragmentSize)
	{
		this.fragmentSize = fragmentSize;
	}

	/**
	 * Resets the fragment counter for a new piece of text.
	 */
	public void start(String originalText)
	{
		currentNumFrags = 1;
	}

	/**
	 * A new fragment begins once a token's end offset crosses the next
	 * multiple of the fragment size.
	 */
	public boolean isNewFragment(Token token)
	{
		if (token.endOffset() < fragmentSize * currentNumFrags)
		{
			return false;
		}
		currentNumFrags++;
		return true;
	}

	/**
	 * @return size in characters of each fragment
	 */
	public int getFragmentSize()
	{
		return fragmentSize;
	}

	/**
	 * @param size size in characters of each fragment
	 */
	public void setFragmentSize(int size)
	{
		fragmentSize = size;
	}
}

View File

@ -0,0 +1,57 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Simple {@link Formatter} implementation to highlight terms with a pre and post tag.
 * @author MAHarwood
 */
public class SimpleHTMLFormatter implements Formatter
{
	private static final String DEFAULT_PRE_TAG = "<B>";
	private static final String DEFAULT_POST_TAG = "</B>";

	String preTag;
	String postTag;

	/**
	 * @param preTag text prepended to every highlighted term
	 * @param postTag text appended to every highlighted term
	 */
	public SimpleHTMLFormatter(String preTag, String postTag)
	{
		this.preTag = preTag;
		this.postTag = postTag;
	}

	/**
	 * Default constructor uses HTML: &lt;B&gt; tags to markup terms.
	 */
	public SimpleHTMLFormatter()
	{
		// delegate rather than duplicating the field assignments
		this(DEFAULT_PRE_TAG, DEFAULT_POST_TAG);
	}

	/**
	 * Wraps the term text in the pre/post tags; terms with a non-positive
	 * score (i.e. not query terms) are returned unchanged.
	 */
	public String highlightTerm(String originalText, String term, float score, int startOffset)
	{
		if (score <= 0)
		{
			return originalText;
		}
		// presize the buffer so the three appends never trigger a reallocation
		StringBuffer sb = new StringBuffer(preTag.length() + originalText.length() + postTag.length());
		sb.append(preTag);
		sb.append(originalText);
		sb.append(postTag);
		return sb.toString();
	}
}

View File

@ -0,0 +1,70 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Low-level class used to record information about a section of a document
 * with a score.
 * @author MAHarwood
 */
public class TextFragment
{
	int fragNum;      // sequence number of this fragment within the document
	int textStartPos; // start position of the fragment in the original text
	int textEndPos;   // end position of the fragment in the original text
	float score;      // score assigned to this fragment

	public TextFragment(int textStartPos, int fragNum)
	{
		this.fragNum = fragNum;
		this.textStartPos = textStartPos;
	}

	/** Records the score computed for this fragment. */
	void setScore(float score)
	{
		this.score = score;
	}

	/** @return the score recorded for this fragment */
	public float getScore()
	{
		return score;
	}

	/**
	 * Absorbs an adjacent fragment by extending this fragment's end position.
	 * @param frag2 Fragment to be merged into this one
	 */
	public void merge(TextFragment frag2)
	{
		textEndPos = frag2.textEndPos;
	}

	/**
	 * @param fragment candidate predecessor fragment
	 * @return true if this fragment starts exactly where the one passed ends
	 */
	public boolean follows(TextFragment fragment)
	{
		return fragment.textEndPos == textStartPos;
	}

	/**
	 * @return the fragment sequence number
	 */
	public int getFragNum()
	{
		return fragNum;
	}
}

View File

@ -0,0 +1,64 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Lightweight class to hold term and a weight value used for scoring this term
 * @author Mark Harwood
 */
public class WeightedTerm
{
	float weight; // scoring multiplier for this term
	String term;  // term text (stemmed form)

	public WeightedTerm (float weight,String term)
	{
		this.term = term;
		this.weight = weight;
	}

	/**
	 * @return the term value (stemmed)
	 */
	public String getTerm()
	{
		return term;
	}

	/**
	 * @param term the term value (stemmed)
	 */
	public void setTerm(String term)
	{
		this.term = term;
	}

	/**
	 * @return the weight associated with this term
	 */
	public float getWeight()
	{
		return weight;
	}

	/**
	 * @param weight the weight associated with this term
	 */
	public void setWeight(float weight)
	{
		this.weight = weight;
	}
}

View File

@ -0,0 +1,28 @@
<html>
<body>
The highlight package contains classes to provide "keyword in context" features
typically used to highlight search terms in the text of results pages. <br>
The Highlighter class is the central component and can be used to extract the
most interesting sections of a piece of text and highlight them, with the help of
Fragmenter, Scorer and Formatter classes.
<h2>Example Usage</h2>
<pre>
IndexSearcher searcher = new IndexSearcher(ramDir);
Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
query=query.rewrite(reader); //required to expand search terms
Hits hits = searcher.search(query);
Highlighter highlighter =new Highlighter(this,new QueryScorer(query));
for (int i = 0; i < hits.length(); i++)
{
String text = hits.doc(i).get(FIELD_NAME);
TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
// Get 3 best fragments and separate with a "..."
String result = highlighter.getBestFragments(tokenStream,text,3,"...");
System.out.println(result);
}
</pre>
</body>
</html>