mirror of https://github.com/apache/lucene.git
Initial commit of Mark Harwood's Highlighter package
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150972 13f79535-47bb-0310-9956-ffa450edef68
parent 45898ec436
commit 33345f7af9
@ -0,0 +1,10 @@
<?xml version="1.0"?>

<project name="highlighter" default="default">

  <description>
    Hits highlighter
  </description>

  <import file="../common.xml"/>
</project>
@ -0,0 +1,38 @@
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


/**
 * Processes terms found in the original text, typically by applying some form
 * of mark-up to highlight terms in HTML search results pages.
 */
public interface Formatter
{
    /**
     * Highlights a search term. For example, an HTML Formatter could simply do:
     *
     * <p><dl><dt></dt><dd><code>return "&lt;b&gt;" + term + "&lt;/b&gt;";</code></dd></dl>
     *
     * @param originalTermText (unstemmed) term text to highlight
     * @param stemmedTerm the stemmed form of originalTermText
     * @param score the score for this term, as returned by the active {@link Scorer}
     * @param startOffset the position of originalTermText in the text being highlighted
     *
     * @return highlighted term text
     */
    String highlightTerm(String originalTermText, String stemmedTerm, float score, int startOffset);
}
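For illustration, a minimal sketch of an alternative Formatter (hypothetical class, not part of this commit) that surfaces the score parameter in the generated markup:

// Hedged sketch, assumed to live in the same package as Formatter.
// Exposes the term score as an HTML title attribute.
public class ScoreTitleFormatter implements Formatter // hypothetical name
{
    public String highlightTerm(String originalTermText, String stemmedTerm, float score, int startOffset)
    {
        if (score <= 0)
        {
            return originalTermText; // not a query term - leave untouched
        }
        return "<b title=\"score: " + score + "\">" + originalTermText + "</b>";
    }
}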
@ -0,0 +1,40 @@
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Token;

/**
 * Implements the policy for breaking text into multiple fragments for consideration
 * by the {@link Highlighter} class. A sophisticated implementation may do this on the basis
 * of detecting end of sentences in the text.
 * @author mark@searcharea.co.uk
 */
public interface Fragmenter
{
    /**
     * Initializes the Fragmenter.
     * @param originalText the full text to be fragmented
     */
    public void start(String originalText);

    /**
     * Tests to see if this token from the stream should start a new TextFragment.
     * @param nextToken the next token to be considered
     * @return true if a new fragment should begin at this token
     */
    public boolean isNewFragment(Token nextToken);
}
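A sketch of the sentence-detection approach mentioned in the Javadoc above (hypothetical class, not part of this commit; assumes '.' alone marks a sentence end and that tokens do not overlap):

import org.apache.lucene.analysis.Token;

// Hedged sketch, same package as Fragmenter: start a new fragment
// whenever a full stop appears between consecutive tokens.
public class SentenceFragmenter implements Fragmenter // hypothetical name
{
    private String text;
    private int lastTokenEnd;

    public void start(String originalText)
    {
        text = originalText;
        lastTokenEnd = 0;
    }

    public boolean isNewFragment(Token nextToken)
    {
        // did a '.' appear between the previous token and this one?
        int start = Math.min(nextToken.startOffset(), text.length());
        boolean sentenceEnded = text.substring(lastTokenEnd, start).indexOf('.') >= 0;
        lastTokenEnd = Math.min(nextToken.endOffset(), text.length());
        return sentenceEnded;
    }
}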
@ -0,0 +1,430 @@
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue;

/**
 * Class used to mark up highlighted terms found in the best sections of a
 * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}
 * and tokenizers.
 * @author mark@searcharea.co.uk
 */
public class Highlighter
{
    public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE = 50 * 1024;
    private int maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
    private Formatter formatter;
    private Fragmenter textFragmenter = new SimpleFragmenter();
    private Scorer fragmentScorer = null;

    public Highlighter(Scorer fragmentScorer)
    {
        this(new SimpleHTMLFormatter(), fragmentScorer);
    }

    public Highlighter(Formatter formatter, Scorer fragmentScorer)
    {
        this.formatter = formatter;
        this.fragmentScorer = fragmentScorer;
    }

    /**
     * Highlights chosen terms in a text, extracting the most relevant section.
     * The document text is analysed in chunks to record hit statistics
     * across the document. After accumulating stats, the fragment with the highest score
     * is returned.
     *
     * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
     *                    This is typically produced by an analyzer re-parsing a document's
     *                    text. Some work may be done on retrieving TokenStreams more efficiently
     *                    by adding support for storing original text position data in the Lucene
     *                    index but this support is not currently available (as of Lucene 1.4 rc2).
     * @param text text to highlight terms in
     *
     * @return highlighted text fragment, or null if no terms were found
     */
    public final String getBestFragment(TokenStream tokenStream, String text)
        throws IOException
    {
        String[] results = getBestFragments(tokenStream, text, 1);
        if (results.length > 0)
        {
            return results[0];
        }
        return null;
    }

    /**
     * Highlights chosen terms in a text, extracting the most relevant sections.
     * The document text is analysed in chunks to record hit statistics
     * across the document. After accumulating stats, the fragments with the highest scores
     * are returned as an array of strings in order of score (contiguous fragments are merged into
     * one in their original order to improve readability).
     *
     * @param tokenStream a stream of tokens identified in the text parameter
     * @param text text to highlight terms in
     * @param maxNumFragments the maximum number of fragments
     *
     * @return highlighted text fragments (between 0 and maxNumFragments of them)
     */
    public final String[] getBestFragments(
        TokenStream tokenStream,
        String text,
        int maxNumFragments)
        throws IOException
    {
        maxNumFragments = Math.max(1, maxNumFragments); //sanity check

        StringBuffer newText = new StringBuffer();
        TextFragment[] frag = getBestDocFragments(tokenStream, text, newText, maxNumFragments);
        mergeContiguousFragments(frag);

        //Get text
        ArrayList fragTexts = new ArrayList();
        for (int i = 0; i < frag.length; i++)
        {
            if ((frag[i] != null) && (frag[i].getScore() > 0))
            {
                fragTexts.add(
                    newText.substring(
                        frag[i].textStartPos,
                        frag[i].textEndPos));
            }
        }
        return (String[]) fragTexts.toArray(new String[0]);
    }

    /**
     * Low level API to get the most relevant sections of the document.
     * @param tokenStream stream of tokens with offsets into text
     * @param text the original text
     * @param newText buffer that accumulates the marked-up copy of text
     * @param maxNumFragments the maximum number of fragments to return
     * @return the best-scoring fragments, in descending order of score
     * @throws IOException
     */
    private final TextFragment[] getBestDocFragments(
        TokenStream tokenStream,
        String text,
        StringBuffer newText,
        int maxNumFragments)
        throws IOException
    {
        ArrayList docFrags = new ArrayList();
        TextFragment currentFrag = new TextFragment(newText.length(), docFrags.size());
        fragmentScorer.startFragment(currentFrag);
        docFrags.add(currentFrag);

        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

        try
        {
            org.apache.lucene.analysis.Token token;
            String tokenText;
            int startOffset;
            int endOffset;
            int lastEndOffset = 0;
            textFragmenter.start(text);

            while ((token = tokenStream.next()) != null)
            {
                startOffset = token.startOffset();
                endOffset = token.endOffset();
                //FIXME an issue was reported with CJKTokenizer that I couldn't reproduce
                // where the analyzer was producing overlapping tokens.
                // I suspect the fix is to make startOffset=Math.max(startOffset,lastEndOffset+1)
                // but can't be sure so I'll just leave this comment in for now
                tokenText = text.substring(startOffset, endOffset);

                // append text between end of last token (or beginning of text) and start of current token
                if (startOffset > lastEndOffset)
                    newText.append(text.substring(lastEndOffset, startOffset));

                // does query contain current token?
                float score = fragmentScorer.getTokenScore(token);
                newText.append(formatter.highlightTerm(tokenText, token.termText(), score, startOffset));

                if (textFragmenter.isNewFragment(token))
                {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    //record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }

                lastEndOffset = endOffset;
                if (lastEndOffset > maxDocBytesToAnalyze)
                {
                    break;
                }
            }
            currentFrag.setScore(fragmentScorer.getFragmentScore());

            // append text after end of last token
            if (lastEndOffset < text.length())
                newText.append(text.substring(lastEndOffset));

            currentFrag.textEndPos = newText.length();

            //sort the most relevant sections of the text
            int minScore = 0;
            for (Iterator i = docFrags.iterator(); i.hasNext();)
            {
                currentFrag = (TextFragment) i.next();

                //If you are running with a version of Lucene before 11th Sept 03
                // you do not have PriorityQueue.insert() - so uncomment the code below
                /*
                if (currentFrag.getScore() >= minScore)
                {
                    fragQueue.put(currentFrag);
                    if (fragQueue.size() > maxNumFragments)
                    { // if hit queue overfull
                        fragQueue.pop(); // remove lowest in hit queue
                        minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                    }
                }
                */
                //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
                //fix to PriorityQueue. The correct method to use here is the new "insert" method
                // USE ABOVE CODE IF THIS DOES NOT COMPILE!
                fragQueue.insert(currentFrag);
            }

            //return the most relevant fragments
            TextFragment frag[] = new TextFragment[fragQueue.size()];
            for (int i = frag.length - 1; i >= 0; i--)
            {
                frag[i] = (TextFragment) fragQueue.pop();
            }
            return frag;
        }
        finally
        {
            if (tokenStream != null)
            {
                try
                {
                    tokenStream.close();
                }
                catch (Exception e)
                {
                }
            }
        }
    }

    /** Improves readability of a score-sorted list of TextFragments by merging any fragments
     * that were contiguous in the original text into one larger fragment with the correct order.
     * This will leave a "null" in the array entry for the lesser scored fragment.
     *
     * @param frag an array of document fragments in descending order of score
     */
    private void mergeContiguousFragments(TextFragment[] frag)
    {
        boolean mergingStillBeingDone;
        if (frag.length > 1)
            do
            {
                mergingStillBeingDone = false; //initialise loop control flag
                //for each fragment, scan other frags looking for contiguous blocks
                for (int i = 0; i < frag.length; i++)
                {
                    if (frag[i] == null)
                    {
                        continue;
                    }
                    //merge any contiguous blocks
                    for (int x = 0; x < frag.length; x++)
                    {
                        if (frag[x] == null)
                        {
                            continue;
                        }
                        if (frag[i] == null)
                        {
                            break;
                        }
                        TextFragment frag1 = null;
                        TextFragment frag2 = null;
                        int frag1Num = 0;
                        int frag2Num = 0;
                        int bestScoringFragNum;
                        int worstScoringFragNum;
                        //if blocks are contiguous....
                        if (frag[i].follows(frag[x]))
                        {
                            frag1 = frag[x];
                            frag1Num = x;
                            frag2 = frag[i];
                            frag2Num = i;
                        }
                        else if (frag[x].follows(frag[i]))
                        {
                            frag1 = frag[i];
                            frag1Num = i;
                            frag2 = frag[x];
                            frag2Num = x;
                        }
                        //merging required..
                        if (frag1 != null)
                        {
                            if (frag1.getScore() > frag2.getScore())
                            {
                                bestScoringFragNum = frag1Num;
                                worstScoringFragNum = frag2Num;
                            }
                            else
                            {
                                bestScoringFragNum = frag2Num;
                                worstScoringFragNum = frag1Num;
                            }
                            frag1.merge(frag2);
                            frag[worstScoringFragNum] = null;
                            mergingStillBeingDone = true;
                            frag[bestScoringFragNum] = frag1;
                        }
                    }
                }
            }
            while (mergingStillBeingDone);
    }

    /**
     * Highlights terms in the text, extracting the most relevant sections
     * and concatenating the chosen fragments with a separator (typically "...").
     * The document text is analysed in chunks to record hit statistics
     * across the document. After accumulating stats, the fragments with the highest scores
     * are returned in order as "separator"-delimited strings.
     *
     * @param tokenStream a stream of tokens identified in the text parameter
     * @param text text to highlight terms in
     * @param maxNumFragments the maximum number of fragments
     * @param separator the separator used to intersperse the document fragments (typically "...")
     *
     * @return highlighted text
     */
    public final String getBestFragments(
        TokenStream tokenStream,
        String text,
        int maxNumFragments,
        String separator)
        throws IOException
    {
        String sections[] = getBestFragments(tokenStream, text, maxNumFragments);
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < sections.length; i++)
        {
            if (i > 0)
            {
                result.append(separator);
            }
            result.append(sections[i]);
        }
        return result.toString();
    }

    /**
     * @return the maximum number of bytes to be tokenized per doc
     */
    public int getMaxDocBytesToAnalyze()
    {
        return maxDocBytesToAnalyze;
    }

    /**
     * @param byteCount the maximum number of bytes to be tokenized per doc
     * (This can improve performance with large documents)
     */
    public void setMaxDocBytesToAnalyze(int byteCount)
    {
        maxDocBytesToAnalyze = byteCount;
    }

    /**
     * @return the {@link Fragmenter} used to break text into fragments
     */
    public Fragmenter getTextFragmenter()
    {
        return textFragmenter;
    }

    /**
     * @param fragmenter the {@link Fragmenter} used to break text into fragments
     */
    public void setTextFragmenter(Fragmenter fragmenter)
    {
        textFragmenter = fragmenter;
    }

    /**
     * @return Object used to score each text fragment
     */
    public Scorer getFragmentScorer()
    {
        return fragmentScorer;
    }

    /**
     * @param scorer Object used to score each text fragment
     */
    public void setFragmentScorer(Scorer scorer)
    {
        fragmentScorer = scorer;
    }
}

class FragmentQueue extends PriorityQueue
{
    public FragmentQueue(int size)
    {
        initialize(size);
    }

    public final boolean lessThan(Object a, Object b)
    {
        TextFragment fragA = (TextFragment) a;
        TextFragment fragB = (TextFragment) b;
        if (fragA.getScore() == fragB.getScore())
            return fragA.fragNum > fragB.fragNum;
        else
            return fragA.getScore() < fragB.getScore();
    }
}
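Putting the pieces above together, a minimal usage sketch (not part of this commit; assumes an existing analyzer, a rewritten query and the stored field text - the test class below exercises the same calls against a RAMDirectory index):

// Hedged usage sketch: highlight the best sections of one document's text.
TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(text));
Highlighter highlighter = new Highlighter(new QueryScorer(query)); // default <B>...</B> markup
highlighter.setTextFragmenter(new SimpleFragmenter(40)); // ~40-char fragments
String snippet = highlighter.getBestFragments(tokenStream, text, 3, "..."); // top 3, "..."-separated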
@ -0,0 +1,442 @@
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
//import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.RAMDirectory;

/**
 * JUnit test for the {@link Highlighter} class.
 * @author mark@searcharea.co.uk
 */
public class HighlighterTest extends TestCase implements Formatter
{
    private IndexReader reader;
    private static final String FIELD_NAME = "contents";
    private Query query;
    RAMDirectory ramDir;
    public Searcher searcher = null;
    public Hits hits = null;
    int numHighlights = 0;
    Analyzer analyzer = new StandardAnalyzer();

    String texts[] =
        {
            "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
            "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
            "JFK has been shot",
            "John Kennedy has been shot",
            "This text has a typo in referring to Keneddy" };

    /**
     * Constructor for HighlighterTest.
     * @param arg0
     */
    public HighlighterTest(String arg0)
    {
        super(arg0);
    }

    public void testSimpleHighlighter() throws Exception
    {
        doSearching("Kennedy");
        Highlighter highlighter = new Highlighter(new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(40));
        int maxNumFragmentsRequired = 2;
        for (int i = 0; i < hits.length(); i++)
        {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

            String result =
                highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
            System.out.println("\t" + result);
        }
        //Not sure we can assert anything here - just running to check we don't throw any exceptions
    }

    public void testGetBestFragmentsSimpleQuery() throws Exception
    {
        doSearching("Kennedy");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 4);
    }

    public void testGetFuzzyFragments() throws Exception
    {
        doSearching("Kinnedy~");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 4);
    }

    public void testGetWildCardFragments() throws Exception
    {
        doSearching("K?nnedy");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 4);
    }

    public void testGetMidWildCardFragments() throws Exception
    {
        doSearching("K*dy");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 5);
    }

    public void testGetRangeFragments() throws Exception
    {
        doSearching(FIELD_NAME + ":[kannedy TO kznnedy]"); //bug? needs lower case
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 5);
    }

    public void testGetBestFragmentsPhrase() throws Exception
    {
        doSearching("\"John Kennedy\"");
        doStandardHighlights();
        //Currently highlights "John" and "Kennedy" separately
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 2);
    }

    public void testGetBestFragmentsMultiTerm() throws Exception
    {
        doSearching("John Kenn*");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 5);
    }

    public void testGetBestFragmentsWithOr() throws Exception
    {
        doSearching("JFK OR Kennedy");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 5);
    }

    public void testGetBestSingleFragment() throws Exception
    {
        doSearching("Kennedy");
        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(40));

        for (int i = 0; i < hits.length(); i++)
        {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
            String result = highlighter.getBestFragment(tokenStream, text);
            System.out.println("\t" + result);
        }
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 4);
    }

    public void testGetBestSingleFragmentWithWeights() throws Exception
    {
        WeightedTerm[] wTerms = new WeightedTerm[2];
        wTerms[0] = new WeightedTerm(10f, "hello");
        wTerms[1] = new WeightedTerm(1f, "kennedy");
        Highlighter highlighter = new Highlighter(new QueryScorer(wTerms));
        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
        highlighter.setTextFragmenter(new SimpleFragmenter(2));

        String result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
        assertTrue("Failed to find best section using weighted terms. Found: " + result,
            "<B>Hello</B>".equals(result));

        //readjust weights
        wTerms[1].setWeight(50f);
        tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
        highlighter = new Highlighter(new QueryScorer(wTerms));
        highlighter.setTextFragmenter(new SimpleFragmenter(2));

        result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
        assertTrue("Failed to find best section using weighted terms. Found: " + result,
            "<B>kennedy</B>".equals(result));
    }

    public void testGetSimpleHighlight() throws Exception
    {
        doSearching("Kennedy");
        Highlighter highlighter =
            new Highlighter(this, new QueryScorer(query));

        for (int i = 0; i < hits.length(); i++)
        {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

            String result = highlighter.getBestFragment(tokenStream, text);
            System.out.println("\t" + result);
        }
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 4);
    }

    public void testMaxSizeHighlight() throws Exception
    {
        doSearching("meat");
        Highlighter highlighter =
            new Highlighter(this, new QueryScorer(query));
        highlighter.setMaxDocBytesToAnalyze(30);
        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
        String result = highlighter.getBestFragment(tokenStream, texts[0]);
        assertTrue("Setting MaxDocBytesToAnalyze should have prevented " +
            "us from finding matches for this record: " + numHighlights +
            " found", numHighlights == 0);
    }

    public void testUnRewrittenQuery() throws IOException, ParseException
    {
        //test to show that a multi-term query produces no highlights unless rewritten
        searcher = new IndexSearcher(ramDir);
        Analyzer analyzer = new StandardAnalyzer();
        Query query = QueryParser.parse("JF? or Kenned*", FIELD_NAME, analyzer);
        System.out.println("Searching with primitive query");
        //forget to set this and...
        //query=query.rewrite(reader);
        Hits hits = searcher.search(query);

        //create an instance of the highlighter with the tags used to surround highlighted text
        Highlighter highlighter =
            new Highlighter(this, new QueryScorer(query));

        highlighter.setTextFragmenter(new SimpleFragmenter(40));

        int maxNumFragmentsRequired = 3;

        for (int i = 0; i < hits.length(); i++)
        {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

            String highlightedText = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
            System.out.println(highlightedText);
        }
        //We expect to have zero highlights if the query is multi-term and is not rewritten!
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 0);
    }

    public void testNoFragments() throws Exception
    {
        doSearching("AnInvalidQueryWhichShouldYieldNoResults");
        Highlighter highlighter =
            new Highlighter(this, new QueryScorer(query));

        for (int i = 0; i < texts.length; i++)
        {
            String text = texts[i];
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

            String result = highlighter.getBestFragment(tokenStream, text);
            assertNull("The highlight result should be null for text with no query terms", result);
        }
    }

    public void testMultiSearcher() throws Exception
    {
        //setup index 1
        RAMDirectory ramDir1 = new RAMDirectory();
        IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true);
        Document d = new Document();
        Field f = new Field(FIELD_NAME, "multiOne", true, true, true);
        d.add(f);
        writer1.addDocument(d);
        writer1.optimize();
        writer1.close();
        IndexReader reader1 = IndexReader.open(ramDir1);

        //setup index 2
        RAMDirectory ramDir2 = new RAMDirectory();
        IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true);
        d = new Document();
        f = new Field(FIELD_NAME, "multiTwo", true, true, true);
        d.add(f);
        writer2.addDocument(d);
        writer2.optimize();
        writer2.close();
        IndexReader reader2 = IndexReader.open(ramDir2);

        IndexSearcher searchers[] = new IndexSearcher[2];
        searchers[0] = new IndexSearcher(ramDir1);
        searchers[1] = new IndexSearcher(ramDir2);
        MultiSearcher multiSearcher = new MultiSearcher(searchers);
        query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer());
        System.out.println("Searching for: " + query.toString(FIELD_NAME));
        //at this point the multisearcher calls combine(query[])
        hits = multiSearcher.search(query);

        //query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer());
        Query expandedQueries[] = new Query[2];
        expandedQueries[0] = query.rewrite(reader1);
        expandedQueries[1] = query.rewrite(reader2);
        query = query.combine(expandedQueries);

        //create an instance of the highlighter with the tags used to surround highlighted text
        Highlighter highlighter =
            new Highlighter(this, new QueryScorer(query));

        for (int i = 0; i < hits.length(); i++)
        {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
            String highlightedText = highlighter.getBestFragment(tokenStream, text);
            System.out.println(highlightedText);
        }
        assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 2);
    }

    /*
    public void testBigramAnalyzer() throws IOException, ParseException
    {
        //test to ensure analyzers with non-consecutive start/end offsets
        //don't double-highlight text
        //setup index 1
        RAMDirectory ramDir = new RAMDirectory();
        Analyzer bigramAnalyzer = new CJKAnalyzer();
        IndexWriter writer = new IndexWriter(ramDir, bigramAnalyzer, true);
        Document d = new Document();
        Field f = new Field(FIELD_NAME, "java abc def", true, true, true);
        d.add(f);
        writer.addDocument(d);
        writer.close();
        IndexReader reader = IndexReader.open(ramDir);

        IndexSearcher searcher = new IndexSearcher(reader);
        query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);
        System.out.println("Searching for: " + query.toString(FIELD_NAME));
        hits = searcher.search(query);

        Highlighter highlighter =
            new Highlighter(this, new QueryFragmentScorer(query));

        for (int i = 0; i < hits.length(); i++)
        {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = bigramAnalyzer.tokenStream(FIELD_NAME, new StringReader(text));
            String highlightedText = highlighter.getBestFragment(tokenStream, text);
            System.out.println(highlightedText);
        }
    }
    */

    public String highlightTerm(String originalText, String weightedTerm, float score, int startOffset)
    {
        if (score <= 0)
        {
            return originalText;
        }
        numHighlights++; //update stats used in assertions
        return "<b>" + originalText + "</b>";
    }

    public void doSearching(String queryString) throws Exception
    {
        searcher = new IndexSearcher(ramDir);
        query = QueryParser.parse(queryString, FIELD_NAME, new StandardAnalyzer());
        //for any multi-term queries to work (prefix, wildcard, range, fuzzy etc) you must use a rewritten query!
        query = query.rewrite(reader);
        System.out.println("Searching for: " + query.toString(FIELD_NAME));
        hits = searcher.search(query);
    }

    void doStandardHighlights() throws Exception
    {
        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(20));
        for (int i = 0; i < hits.length(); i++)
        {
            String text = hits.doc(i).get(FIELD_NAME);
            int maxNumFragmentsRequired = 2;
            String fragmentSeparator = "...";
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

            String result =
                highlighter.getBestFragments(
                    tokenStream,
                    text,
                    maxNumFragmentsRequired,
                    fragmentSeparator);
            System.out.println("\t" + result);
        }
    }

    /*
     * @see TestCase#setUp()
     */
    protected void setUp() throws Exception
    {
        ramDir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(), true);
        for (int i = 0; i < texts.length; i++)
        {
            addDoc(writer, texts[i]);
        }

        writer.optimize();
        writer.close();
        reader = IndexReader.open(ramDir);
        numHighlights = 0;
    }

    private void addDoc(IndexWriter writer, String text) throws IOException
    {
        Document d = new Document();
        Field f = new Field(FIELD_NAME, text, true, true, true);
        d.add(f);
        writer.addDocument(d);
    }

    /*
     * @see TestCase#tearDown()
     */
    protected void tearDown() throws Exception
    {
        super.tearDown();
    }
}
@ -0,0 +1,113 @@
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashMap;
import java.util.HashSet;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.search.Query;

/**
 * {@link Scorer} implementation which scores text fragments by the number of unique query terms found.
 * This class uses the {@link QueryTermExtractor} class to determine the query terms and
 * their boosts to be used.
 * @author mark@searcharea.co.uk
 */
//TODO: provide option to roll idf into the scoring equation by passing an IndexReader.
//TODO: provide option to boost score of fragments near beginning of document
// based on fragment.getFragNum()
public class QueryScorer implements Scorer
{
    TextFragment currentTextFragment = null;
    HashSet uniqueTermsInFragment;
    float totalScore = 0;
    private HashMap termsToFind;

    /**
     * @param query a Lucene query (ideally rewritten using query.rewrite
     * before being passed to this class and the searcher)
     */
    public QueryScorer(Query query)
    {
        this(QueryTermExtractor.getTerms(query));
    }

    public QueryScorer(WeightedTerm[] weightedTerms)
    {
        termsToFind = new HashMap();
        for (int i = 0; i < weightedTerms.length; i++)
        {
            termsToFind.put(weightedTerms[i].term, weightedTerms[i]);
        }
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
     */
    public void startFragment(TextFragment newFragment)
    {
        uniqueTermsInFragment = new HashSet();
        currentTextFragment = newFragment;
        totalScore = 0;
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.search.highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)
     */
    public float getTokenScore(Token token)
    {
        String termText = token.termText();

        WeightedTerm queryTerm = (WeightedTerm) termsToFind.get(termText);
        if (queryTerm == null)
        {
            //not a query term - return
            return 0;
        }
        //found a query term - is it unique in this fragment?
        if (!uniqueTermsInFragment.contains(termText))
        {
            totalScore += queryTerm.getWeight();
            uniqueTermsInFragment.add(termText);
        }
        return queryTerm.getWeight();
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.search.highlight.FragmentScorer#endFragment(org.apache.lucene.search.highlight.TextFragment)
     */
    public float getFragmentScore()
    {
        return totalScore;
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
     */
    public void allFragmentsProcessed()
    {
        //this class has no special operations to perform at end of processing
    }
}
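The WeightedTerm constructor allows scoring to be biased without going through a Query; a brief sketch (the terms and weights are illustrative):

// Hedged sketch: favour fragments containing "kennedy" over "shot".
WeightedTerm[] terms = new WeightedTerm[] {
    new WeightedTerm(10f, "kennedy"),
    new WeightedTerm(1f, "shot")
};
Highlighter highlighter = new Highlighter(new QueryScorer(terms));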
@ -0,0 +1,115 @@
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashSet;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

/**
 * Utility class used to extract the terms used in a query, plus any weights.
 * This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery classes
 * so the caller must pass a rewritten query (see query.rewrite) to obtain a list of
 * expanded terms.
 */
public final class QueryTermExtractor
{
    /**
     * Extracts all term texts of a given Query into an array of WeightedTerms.
     *
     * @param query Query to extract term texts from
     * @return an array of the terms used in a query, plus their weights
     */
    public static final WeightedTerm[] getTerms(Query query)
    {
        return getTerms(query, false);
    }

    /**
     * Extracts all term texts of a given Query into an array of WeightedTerms.
     *
     * @param query Query to extract term texts from
     * @param prohibited <code>true</code> to extract "prohibited" terms, too
     * @return an array of the terms used in a query, plus their weights
     */
    public static final WeightedTerm[] getTerms(Query query, boolean prohibited)
    {
        HashSet terms = new HashSet();
        getTerms(query, terms, prohibited);
        return (WeightedTerm[]) terms.toArray(new WeightedTerm[0]);
    }

    private static final void getTerms(Query query, HashSet terms, boolean prohibited)
    {
        if (query instanceof BooleanQuery)
            getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited);
        else if (query instanceof PhraseQuery)
            getTermsFromPhraseQuery((PhraseQuery) query, terms);
        else if (query instanceof TermQuery)
            getTermsFromTermQuery((TermQuery) query, terms);
        //	else
        //		if ((query instanceof PrefixQuery)
        //			|| (query instanceof RangeQuery)
        //			|| (query instanceof MultiTermQuery))
        //		{
        //			//client should call rewrite BEFORE calling highlighter
        //			//	Query expandedQuery = rewrite(reader, query);
        //			//	getTerms(reader, expandedQuery, terms, prohibited);
        //		}
    }

    private static final void getTermsFromBooleanQuery(BooleanQuery query, HashSet terms, boolean prohibited)
    {
        BooleanClause[] queryClauses = query.getClauses();

        for (int i = 0; i < queryClauses.length; i++)
        {
            if (prohibited || !queryClauses[i].prohibited)
                getTerms(queryClauses[i].query, terms, prohibited);
        }
    }

    private static final void getTermsFromPhraseQuery(PhraseQuery query, HashSet terms)
    {
        Term[] queryTerms = query.getTerms();

        for (int i = 0; i < queryTerms.length; i++)
        {
            terms.add(new WeightedTerm(query.getBoost(), queryTerms[i].text()));
        }
    }

    private static final void getTermsFromTermQuery(TermQuery query, HashSet terms)
    {
        terms.add(new WeightedTerm(query.getBoost(), query.getTerm().text()));
    }
}
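Because of the limitation described above, multi-term queries must be rewritten before extraction; a brief sketch (assumes an existing IndexReader):

// Hedged sketch: prefix/wildcard/range queries yield no terms until rewritten.
Query query = QueryParser.parse("Kenn*", "contents", new StandardAnalyzer());
WeightedTerm[] before = QueryTermExtractor.getTerms(query); // empty - PrefixQuery is skipped
query = query.rewrite(reader); // expands the prefix into TermQuerys
WeightedTerm[] after = QueryTermExtractor.getTerms(query);  // now holds the expanded terms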
@ -0,0 +1,48 @@
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Token;

/**
 * Adds to the score for a fragment based on its tokens.
 * @author mark@searcharea.co.uk
 */
public interface Scorer
{
    /**
     * Called when a new fragment is started for consideration.
     * @param newFragment
     */
    public void startFragment(TextFragment newFragment);

    /**
     * Called for each token in the current fragment.
     * @param token The token to be scored
     * @return a score which is passed to the {@link Highlighter} class to influence the mark-up of the text
     * (this return value is NOT used to score the fragment)
     */
    public float getTokenScore(Token token);

    /**
     * Called when the highlighter has no more tokens for the current fragment.
     * @return the total score for the current fragment
     */
    public float getFragmentScore();
}
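A minimal sketch of an alternative implementation (hypothetical class, not part of this commit) that treats every token as an equally weighted hit, ranking fragments purely by token count:

import org.apache.lucene.analysis.Token;

// Hedged sketch, same package as Scorer.
public class TokenCountScorer implements Scorer // hypothetical name
{
    private float totalScore;

    public void startFragment(TextFragment newFragment)
    {
        totalScore = 0; // reset the per-fragment running total
    }

    public float getTokenScore(Token token)
    {
        totalScore += 1.0f;
        return 1.0f; // every token gets marked up
    }

    public float getFragmentScore()
    {
        return totalScore;
    }
}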
@ -0,0 +1,84 @@
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Token;

/**
 * {@link Fragmenter} implementation which breaks text up into same-size
 * fragments with no concerns over spotting sentence boundaries.
 * @author mark@searcharea.co.uk
 */
public class SimpleFragmenter implements Fragmenter
{
    private static final int DEFAULT_FRAGMENT_SIZE = 100;
    private int currentNumFrags;
    private int fragmentSize;

    public SimpleFragmenter()
    {
        this(DEFAULT_FRAGMENT_SIZE);
    }

    /**
     * @param fragmentSize size in bytes of each fragment
     */
    public SimpleFragmenter(int fragmentSize)
    {
        this.fragmentSize = fragmentSize;
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String)
     */
    public void start(String originalText)
    {
        currentNumFrags = 1;
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
     */
    public boolean isNewFragment(Token token)
    {
        //a new fragment starts once the token stream passes the next size boundary
        boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags);
        if (isNewFrag)
        {
            currentNumFrags++;
        }
        return isNewFrag;
    }

    /**
     * @return size in bytes of each fragment
     */
    public int getFragmentSize()
    {
        return fragmentSize;
    }

    /**
     * @param size size in bytes of each fragment
     */
    public void setFragmentSize(int size)
    {
        fragmentSize = size;
    }
}
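SimpleFragmenter is the fragmenter Highlighter installs by default; a brief tuning sketch (the 40-character size is illustrative):

// Hedged sketch: replace the default ~100-char fragments with ~40-char ones.
Highlighter highlighter = new Highlighter(new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(40));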
@ -0,0 +1,57 @@
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Simple {@link Formatter} implementation to highlight terms with a pre and post tag.
 * @author MAHarwood
 */
public class SimpleHTMLFormatter implements Formatter
{
    String preTag;
    String postTag;

    public SimpleHTMLFormatter(String preTag, String postTag)
    {
        this.preTag = preTag;
        this.postTag = postTag;
    }

    /**
     * Default constructor uses HTML &lt;B&gt; tags to mark up terms.
     */
    public SimpleHTMLFormatter()
    {
        this.preTag = "<B>";
        this.postTag = "</B>";
    }

    public String highlightTerm(String originalText, String term, float score, int startOffset)
    {
        if (score <= 0)
        {
            //not a query term - leave the text untouched
            return originalText;
        }
        StringBuffer sb = new StringBuffer();
        sb.append(preTag);
        sb.append(originalText);
        sb.append(postTag);
        return sb.toString();
    }
}
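For example, to mark hits with a CSS class rather than bold tags (a sketch; the class name is illustrative):

// Hedged sketch: custom pre/post tags instead of the default <B>...</B>.
Formatter formatter = new SimpleHTMLFormatter("<span class=\"highlight\">", "</span>");
Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));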
@ -0,0 +1,70 @@
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


/**
 * Low-level class used to record information about a section of a document
 * with a score.
 * @author MAHarwood
 */
public class TextFragment
{
    int fragNum;
    int textStartPos;
    int textEndPos;
    float score;

    public TextFragment(int textStartPos, int fragNum)
    {
        this.textStartPos = textStartPos;
        this.fragNum = fragNum;
    }

    void setScore(float score)
    {
        this.score = score;
    }

    public float getScore()
    {
        return score;
    }

    /**
     * @param frag2 Fragment to be merged into this one
     */
    public void merge(TextFragment frag2)
    {
        textEndPos = frag2.textEndPos;
    }

    /**
     * @param fragment the fragment to compare positions with
     * @return true if this fragment follows the one passed
     */
    public boolean follows(TextFragment fragment)
    {
        return textStartPos == fragment.textEndPos;
    }

    /**
     * @return the fragment sequence number
     */
    public int getFragNum()
    {
        return fragNum;
    }
}
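The follows()/merge() pair drives Highlighter.mergeContiguousFragments(); a worked sketch of the invariant (assumes same-package access to the package-private position fields):

// Hedged sketch: frag2 begins exactly where frag1 ends, so the two merge into one span.
TextFragment frag1 = new TextFragment(0, 0);
frag1.textEndPos = 20;  // frag1 covers [0..20)
TextFragment frag2 = new TextFragment(20, 1);
frag2.textEndPos = 45;  // frag2 covers [20..45)
if (frag2.follows(frag1))
{
    frag1.merge(frag2); // frag1 now covers [0..45)
}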
@ -0,0 +1,64 @@
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** Lightweight class to hold a term and a weight value used for scoring this term.
 * @author Mark Harwood
 */
public class WeightedTerm
{
    float weight; // multiplier
    String term; // stemmed form

    public WeightedTerm(float weight, String term)
    {
        this.weight = weight;
        this.term = term;
    }

    /**
     * @return the term value (stemmed)
     */
    public String getTerm()
    {
        return term;
    }

    /**
     * @return the weight associated with this term
     */
    public float getWeight()
    {
        return weight;
    }

    /**
     * @param term the term value (stemmed)
     */
    public void setTerm(String term)
    {
        this.term = term;
    }

    /**
     * @param weight the weight associated with this term
     */
    public void setWeight(float weight)
    {
        this.weight = weight;
    }
}
@ -0,0 +1,28 @@
<html>
<body>
The highlight package contains classes to provide "keyword in context" features
typically used to highlight search terms in the text of results pages. <br>
The Highlighter class is the central component and can be used to extract the
most interesting sections of a piece of text and highlight them, with the help of
the Fragmenter, Scorer and Formatter classes.
<h2>Example Usage</h2>

<pre>
IndexSearcher searcher = new IndexSearcher(ramDir);
Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
query = query.rewrite(reader); //required to expand search terms
Hits hits = searcher.search(query);

Highlighter highlighter = new Highlighter(new QueryScorer(query));
for (int i = 0; i &lt; hits.length(); i++)
{
    String text = hits.doc(i).get(FIELD_NAME);
    TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
    // Get 3 best fragments and separate with a "..."
    String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
    System.out.println(result);
}
</pre>

</body>
</html>