Simplified QueryTermExtractor.java to make use of Query.extractTerms method (especially now that all the SpanQuery classes implement this correctly).

Added tests to the JUnit test case to demonstrate the new support for other queries (e.g. FilteredQuery) now that we use the standard extractTerms feature of Query objects.
Also deprecated the Highlighter getBestFragments method that hard-coded the choice of field name, and introduced a new variant that takes an additional fieldName argument.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@389888 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Harwood 2006-03-29 21:01:40 +00:00
parent 4696ac421e
commit 286f4f5f07
3 changed files with 99 additions and 82 deletions

View File

@ -113,7 +113,8 @@ public class Highlighter
* into chunks
* @param text text to highlight terms in
* @param maxNumFragments the maximum number of fragments.
*
* @deprecated This method incorrectly hardcodes the choice of fieldname. Use the
* method of the same name that takes a fieldname.
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
*/
public final String[] getBestFragments(
@ -125,6 +126,29 @@ public class Highlighter
TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
return getBestFragments(tokenStream, text, maxNumFragments);
}
/**
 * Convenience variant of {@link #getBestFragments(TokenStream, String, int)}:
 * tokenizes <code>text</code> with the supplied analyzer under the given field
 * name, then extracts the most relevant highlighted sections.
 *
 * @param analyzer the analyzer that will be used to split <code>text</code>
 * into chunks
 * @param fieldName the name of the field being highlighted (passed to the analyzer)
 * @param text text to highlight terms in
 * @param maxNumFragments the maximum number of fragments.
 *
 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
 * @throws IOException if one is raised while analyzing or highlighting the text
 */
public final String[] getBestFragments(
    Analyzer analyzer,
    String fieldName,
    String text,
    int maxNumFragments)
    throws IOException
{
    StringReader textReader = new StringReader(text);
    return getBestFragments(
        analyzer.tokenStream(fieldName, textReader),
        text,
        maxNumFragments);
}
/**
* Highlights chosen terms in a text, extracting the most relevant sections.

View File

@ -16,18 +16,12 @@ package org.apache.lucene.search.highlight;
*/
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
/**
* Utility class used to extract the terms used in a query, plus any weights.
@ -114,75 +108,22 @@ public final class QueryTermExtractor
/*
 * Collects the terms of <code>query</code> into <code>terms</code> as
 * WeightedTerm entries built from query.getBoost() and the term text.
 *
 * query     - the query whose terms are gathered
 * terms     - destination set; WeightedTerm objects are added to it
 * prohibited - only forwarded to the BooleanQuery helper; the extractTerms
 *              path below does not read it
 * fieldName - when non-null, only terms whose field is this exact String
 *             instance are kept; when null, terms of every field are kept
 *
 * NOTE(review): this view appears to show two implementations back to back
 * (the instanceof dispatch and the extractTerms path) -- confirm which of the
 * two is meant to be live before changing either.
 */
//fieldname MUST be interned prior to this call
//(the field test below uses ==, i.e. reference identity, not equals();
// presumably Term.field() hands back interned strings -- confirm)
private static final void getTerms(Query query, HashSet terms,boolean prohibited, String fieldName)
{
// Type-specific dispatch to the getTermsFrom* helpers; a query of any other
// type adds nothing in this step.
if (query instanceof BooleanQuery)
getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited, fieldName);
else
if (query instanceof PhraseQuery)
getTermsFromPhraseQuery((PhraseQuery) query, terms, fieldName);
else
if (query instanceof TermQuery)
getTermsFromTermQuery((TermQuery) query, terms, fieldName);
else
if(query instanceof SpanNearQuery)
getTermsFromSpanNearQuery((SpanNearQuery) query, terms, fieldName);
// Generic path: ask the query itself for its terms, then filter by field and
// wrap each surviving term with this query's boost.
try
{
// scratch set that receives the raw Term objects from the query
HashSet nonWeightedTerms=new HashSet();
query.extractTerms(nonWeightedTerms);
for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();)
{
Term term = (Term) iter.next();
// null fieldName accepts every field; otherwise identity match on the field name
if((fieldName==null)||(term.field()==fieldName))
{
terms.add(new WeightedTerm(query.getBoost(),term.text()));
}
}
}
catch(UnsupportedOperationException ignore)
{
//this is non-fatal for our purposes
//(a query that cannot report its terms simply contributes none here)
}
}
/*
 * Hands every eligible clause of a BooleanQuery back to getTerms.
 * A clause is skipped only when <code>prohibited</code> is false and the
 * clause's occurrence is MUST_NOT; the <code>prohibited</code> flag and
 * <code>fieldName</code> are passed through unchanged.
 */
private static final void getTermsFromBooleanQuery(BooleanQuery query, HashSet terms, boolean prohibited, String fieldName)
{
    BooleanClause[] clauses = query.getClauses();
    for (int pos = 0; pos < clauses.length; pos++)
    {
        BooleanClause clause = clauses[pos];
        boolean excluded = !prohibited && clause.getOccur() == BooleanClause.Occur.MUST_NOT;
        if (excluded)
        {
            continue;
        }
        getTerms(clause.getQuery(), terms, prohibited, fieldName);
    }
}
/*
 * Adds one WeightedTerm (weighted with the phrase query's boost) per phrase
 * term to <code>terms</code>. When <code>fieldName</code> is non-null, only
 * terms whose field is that same String instance are added (identity
 * comparison, so the caller must pass an interned name).
 */
private static final void getTermsFromPhraseQuery(PhraseQuery query, HashSet terms, String fieldName)
{
    Term[] phraseTerms = query.getTerms();
    for (int pos = 0; pos < phraseTerms.length; pos++)
    {
        Term candidate = phraseTerms[pos];
        if ((fieldName != null) && (candidate.field() != fieldName))
        {
            continue;
        }
        terms.add(new WeightedTerm(query.getBoost(), candidate.text()));
    }
}
/*
 * Adds the single term of a TermQuery to <code>terms</code> as a WeightedTerm
 * carrying the query's boost. Does nothing when <code>fieldName</code> is
 * non-null and is not the same String instance as the term's field.
 */
private static final void getTermsFromTermQuery(TermQuery query, HashSet terms, String fieldName)
{
    if ((fieldName != null) && (query.getTerm().field() != fieldName))
    {
        return;
    }
    terms.add(new WeightedTerm(query.getBoost(), query.getTerm().text()));
}
/*
 * Iterates the terms reported by a SpanNearQuery and adds each one to
 * <code>terms</code> as a WeightedTerm carrying the query's boost. When
 * <code>fieldName</code> is non-null, terms from any other field (identity
 * comparison on the field name) are left out.
 */
private static final void getTermsFromSpanNearQuery(SpanNearQuery query, HashSet terms, String fieldName)
{
    Iterator cursor = query.getTerms().iterator();
    while (cursor.hasNext())
    {
        Term spanTerm = (Term) cursor.next();
        String spanText = spanTerm.text();
        boolean fieldMatches = (fieldName == null) || (spanTerm.field() == fieldName);
        if (fieldMatches)
        {
            terms.add(new WeightedTerm(query.getBoost(), spanText));
        }
    }
}
}

View File

@ -38,13 +38,20 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.RAMDirectory;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
@ -140,6 +147,47 @@ public class HighlighterTest extends TestCase implements Formatter
//Currently highlights "John" and "Kennedy" separately
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
}
/**
 * Searches with a SpanNearQuery over "john" and "kennedy" on the "contents"
 * field (built with arguments 1 and true), runs the standard highlighting
 * pass and expects exactly two highlights.
 */
public void testGetBestFragmentsSpan() throws Exception
{
    SpanQuery[] spanClauses = new SpanQuery[2];
    spanClauses[0] = new SpanTermQuery(new Term("contents","john"));
    spanClauses[1] = new SpanTermQuery(new Term("contents","kennedy"));

    doSearching(new SpanNearQuery(spanClauses,1,true));
    doStandardHighlights();

    //Currently highlights "John" and "Kennedy" separately
    boolean foundExpected = (numHighlights == 2);
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found", foundExpected);
}
/**
 * Wraps a SpanNearQuery over "john" and "kennedy" in a FilteredQuery whose
 * RangeFilter covers "john".."john" (both ends inclusive) on "contents",
 * then checks that highlighting still finds exactly two highlights.
 */
public void testGetBestFragmentsFilteredQuery() throws Exception
{
    RangeFilter johnOnly = new RangeFilter("contents","john","john",true,true);

    SpanQuery[] spanClauses = new SpanQuery[2];
    spanClauses[0] = new SpanTermQuery(new Term("contents","john"));
    spanClauses[1] = new SpanTermQuery(new Term("contents","kennedy"));
    SpanNearQuery nearQuery = new SpanNearQuery(spanClauses,1,true);

    doSearching(new FilteredQuery(nearQuery,johnOnly));
    doStandardHighlights();

    //Currently highlights "John" and "Kennedy" separately
    boolean foundExpected = (numHighlights == 2);
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found", foundExpected);
}
/**
 * Wraps a PhraseQuery for "john" followed by "kennedy" in a FilteredQuery
 * whose RangeFilter covers "john".."john" (both ends inclusive) on
 * "contents", then checks that highlighting finds exactly two highlights.
 */
public void testGetBestFragmentsFilteredPhraseQuery() throws Exception
{
    RangeFilter johnOnly = new RangeFilter("contents","john","john",true,true);

    PhraseQuery phrase = new PhraseQuery();
    phrase.add(new Term("contents","john"));
    phrase.add(new Term("contents","kennedy"));

    doSearching(new FilteredQuery(phrase,johnOnly));
    doStandardHighlights();

    //Currently highlights "John" and "Kennedy" separately
    boolean foundExpected = (numHighlights == 2);
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found", foundExpected);
}
public void testGetBestFragmentsMultiTerm() throws Exception
{
@ -181,7 +229,7 @@ public class HighlighterTest extends TestCase implements Formatter
for (int i = 0; i < hits.length(); i++)
{
String text = hits.doc(i).get(FIELD_NAME);
highlighter.getBestFragments(analyzer, text, 10);
highlighter.getBestFragments(analyzer,FIELD_NAME, text, 10);
}
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
@ -539,11 +587,15 @@ public class HighlighterTest extends TestCase implements Formatter
/**
 * Parses <code>queryString</code> against FIELD_NAME with a StandardAnalyzer,
 * stores the parsed query in the <code>query</code> field and delegates to
 * {@link #doSearching(Query)} to rewrite and execute it.
 *
 * @param queryString the query text to parse
 * @throws Exception if parsing or searching fails
 */
public void doSearching(String queryString) throws Exception
{
    // No searcher is opened here: doSearching(Query) opens its own, so one
    // created at this point would be replaced before use and never closed.
    QueryParser parser=new QueryParser(FIELD_NAME, new StandardAnalyzer());
    query = parser.parse(queryString);
    doSearching(query);
}
/**
 * Opens a searcher over ramDir, rewrites the supplied query against the
 * reader, stores the rewritten form in the <code>query</code> field, prints
 * it, and stores the search results in <code>hits</code>.
 *
 * @param unReWrittenQuery the query to rewrite and run
 * @throws Exception if rewriting or searching fails
 */
public void doSearching(Query unReWrittenQuery) throws Exception
{
    searcher = new IndexSearcher(ramDir);
    //for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query!
    // Rewrite the query that was passed in. Rewriting the previously stored
    // query field first would be discarded work, and would throw a
    // NullPointerException if that field has not been set yet.
    query=unReWrittenQuery.rewrite(reader);
    System.out.println("Searching for: " + query.toString(FIELD_NAME));
    hits = searcher.search(query);
}