LUCENE-3234: add phraseLimit parameter for FVH speed up

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1139995 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2011-06-27 01:44:43 +00:00
parent 88689125fa
commit 9d6f451871
7 changed files with 73 additions and 6 deletions

View File

@ -55,7 +55,16 @@ Bug Fixes
======================= Lucene 3.x (not yet released) ================
(No changes)
New Features
* LUCENE-3234: provide a limit on phrase analysis in FastVectorHighlighter for
highlighting speed up. Use FastVectorHighlighter.setPhraseLimit() to set limit
(e.g. 5000). (Mike Sokolov via Koji Sekiguchi)
API Changes
Bug Fixes
======================= Lucene 3.3.0 =======================

View File

@ -35,6 +35,7 @@ public class FastVectorHighlighter {
private final boolean fieldMatch;
private final FragListBuilder fragListBuilder;
private final FragmentsBuilder fragmentsBuilder;
private int phraseLimit = Integer.MAX_VALUE;
/**
* the default constructor.
@ -173,7 +174,7 @@ public class FastVectorHighlighter {
final FieldQuery fieldQuery, IndexReader reader, int docId,
String fieldName, int fragCharSize ) throws IOException {
FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, fieldName, fieldQuery );
FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery );
FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery, phraseLimit );
return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize );
}
@ -190,4 +191,15 @@ public class FastVectorHighlighter {
* @return whether fieldMatch or not
*/
public boolean isFieldMatch(){ return fieldMatch; }
/**
* @return the maximum number of phrases to analyze when searching for the highest-scoring phrase.
*/
public int getPhraseLimit () { return phraseLimit; }
/**
* set the maximum number of phrases to analyze when searching for the highest-scoring phrase.
* The default is 5000. To ensure that all phrases are analyzed, use a negative number or Integer.MAX_VALUE.
*/
public void setPhraseLimit (int phraseLimit) { this.phraseLimit = phraseLimit; }
}

View File

@ -30,21 +30,32 @@ import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
public class FieldPhraseList {
LinkedList<WeightedPhraseInfo> phraseList = new LinkedList<WeightedPhraseInfo>();
/**
* create a FieldPhraseList that has no limit on the number of phrases to analyze
*
* @param fieldTermStack FieldTermStack object
* @param fieldQuery FieldQuery object
*/
public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery){
this (fieldTermStack, fieldQuery, Integer.MAX_VALUE);
}
/**
* a constructor.
*
* @param fieldTermStack FieldTermStack object
* @param fieldQuery FieldQuery object
* @param phraseLimit maximum size of phraseList
*/
public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery ){
public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery, int phraseLimit){
final String field = fieldTermStack.getFieldName();
LinkedList<TermInfo> phraseCandidate = new LinkedList<TermInfo>();
QueryPhraseMap currMap = null;
QueryPhraseMap nextMap = null;
while( !fieldTermStack.isEmpty() ){
while( !fieldTermStack.isEmpty() && (phraseList.size() < phraseLimit) )
{
phraseCandidate.clear();
TermInfo ti = fieldTermStack.pop();

View File

@ -188,4 +188,34 @@ public class FieldPhraseListTest extends AbstractTestCase {
assertEquals( 1, fpl.phraseList.size() );
assertEquals( "sppeeeed(1.0)((88,93))", fpl.phraseList.get( 0 ).toString() );
}
/* This test shows a big speedup from limiting the number of analyzed phrases in
* this bad case for FieldPhraseList */
/* But it is not reliable as a unit test since it is timing-dependent
public void testManyRepeatedTerms() throws Exception {
long t = System.currentTimeMillis();
testManyTermsWithLimit (-1);
long t1 = System.currentTimeMillis();
testManyTermsWithLimit (1);
long t2 = System.currentTimeMillis();
assertTrue (t2-t1 * 1000 < t1-t);
}
private void testManyTermsWithLimit (int limit) throws Exception {
StringBuilder buf = new StringBuilder ();
for (int i = 0; i < 16000; i++) {
buf.append("a b c ");
}
make1d1fIndex( buf.toString());
Query query = tq("a");
FieldQuery fq = new FieldQuery( query, true, true );
FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
FieldPhraseList fpl = new FieldPhraseList( stack, fq, limit);
if (limit < 0 || limit > 16000)
assertEquals( 16000, fpl.phraseList.size() );
else
assertEquals( limit, fpl.phraseList.size() );
assertEquals( "a(1.0)((0,1))", fpl.phraseList.get( 0 ).toString() );
}
*/
}

View File

@ -254,6 +254,9 @@ New Features
* SOLR-2458: post.jar enhanced to handle JSON, CSV and <optimize> (janhoy)
* LUCENE-3234: add a new parameter hl.phraseLimit for FastVectorHighlighter speed up.
(Mike Sokolov via koji)
Optimizations
----------------------

View File

@ -45,6 +45,7 @@ public interface HighlightParams {
public static final String USE_FVH = HIGHLIGHT + ".useFastVectorHighlighter";
public static final String TAG_PRE = HIGHLIGHT + ".tag.pre";
public static final String TAG_POST = HIGHLIGHT + ".tag.post";
public static final String PHRASE_LIMIT = HIGHLIGHT + ".phraseLimit";
public static final String MULTI_VALUED_SEPARATOR = HIGHLIGHT + ".multiValuedSeparatorChar";
// Formatter

View File

@ -362,6 +362,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
params.getBool( HighlightParams.USE_PHRASE_HIGHLIGHTER, true ),
// FVH cannot process hl.requireFieldMatch parameter per-field basis
params.getBool( HighlightParams.FIELD_MATCH, false ) );
fvh.setPhraseLimit(params.getInt(HighlightParams.PHRASE_LIMIT, Integer.MAX_VALUE));
FieldQuery fieldQuery = fvh.getFieldQuery( query );
// Highlight each document