From 9d6f451871aa7b1ce0539ed0c1f13d870c332bc3 Mon Sep 17 00:00:00 2001 From: Koji Sekiguchi Date: Mon, 27 Jun 2011 01:44:43 +0000 Subject: [PATCH] LUCENE-3234: add phraseLimit parameter for FVH speed up git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1139995 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 11 ++++++- .../FastVectorHighlighter.java | 14 ++++++++- .../vectorhighlight/FieldPhraseList.java | 19 +++++++++--- .../vectorhighlight/FieldPhraseListTest.java | 30 +++++++++++++++++++ solr/CHANGES.txt | 3 ++ .../solr/common/params/HighlightParams.java | 1 + .../highlight/DefaultSolrHighlighter.java | 1 + 7 files changed, 73 insertions(+), 6 deletions(-) diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 18de8d2aeb6..5cd189b37a2 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -55,7 +55,16 @@ Bug Fixes ======================= Lucene 3.x (not yet released) ================ -(No changes) +New Features + + * LUCENE-3234: provide a limit on phrase analysis in FastVectorHighlighter for + highlighting speed up. Use FastVectorHighlighter.setPhraseLimit() to set limit + (e.g. 5000). (Mike Sokolov via Koji Sekiguchi) + +API Changes + +Bug Fixes + ======================= Lucene 3.3.0 ======================= diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java index 48fe2d62bc4..ba6d8bbcc1c 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java @@ -35,6 +35,7 @@ public class FastVectorHighlighter { private final boolean fieldMatch; private final FragListBuilder fragListBuilder; private final FragmentsBuilder fragmentsBuilder; + private int phraseLimit = Integer.MAX_VALUE; /** * the default constructor. @@ -173,7 +174,7 @@ public class FastVectorHighlighter { final FieldQuery fieldQuery, IndexReader reader, int docId, String fieldName, int fragCharSize ) throws IOException { FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, fieldName, fieldQuery ); - FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery ); + FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery, phraseLimit ); return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize ); } @@ -190,4 +191,15 @@ public class FastVectorHighlighter { * @return whether fieldMatch or not */ public boolean isFieldMatch(){ return fieldMatch; } + + /** + * @return the maximum number of phrases to analyze when searching for the highest-scoring phrase. + */ + public int getPhraseLimit () { return phraseLimit; } + + /** + * set the maximum number of phrases to analyze when searching for the highest-scoring phrase. + * The default is 5000. To ensure that all phrases are analyzed, use a negative number or Integer.MAX_VALUE. + */ + public void setPhraseLimit (int phraseLimit) { this.phraseLimit = phraseLimit; } } diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java index 15ae634639d..f2263c53661 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java @@ -30,21 +30,32 @@ import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; public class FieldPhraseList { LinkedList phraseList = new LinkedList(); - + + /** + * create a FieldPhraseList that has no limit on the number of phrases to analyze + * + * @param fieldTermStack FieldTermStack object + * @param fieldQuery FieldQuery object + */ + public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery){ + this (fieldTermStack, fieldQuery, Integer.MAX_VALUE); + } + /** * a constructor. * * @param fieldTermStack FieldTermStack object * @param fieldQuery FieldQuery object + * @param phraseLimit maximum size of phraseList */ - public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery ){ + public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery, int phraseLimit){ final String field = fieldTermStack.getFieldName(); LinkedList phraseCandidate = new LinkedList(); QueryPhraseMap currMap = null; QueryPhraseMap nextMap = null; - while( !fieldTermStack.isEmpty() ){ - + while( !fieldTermStack.isEmpty() && (phraseList.size() < phraseLimit) ) + { phraseCandidate.clear(); TermInfo ti = fieldTermStack.pop(); diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java index a8cedea0ec0..df58435d1ee 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java @@ -188,4 +188,34 @@ public class FieldPhraseListTest extends AbstractTestCase { assertEquals( 1, fpl.phraseList.size() ); assertEquals( "sppeeeed(1.0)((88,93))", fpl.phraseList.get( 0 ).toString() ); } + + /* This test shows a big speedup from limiting the number of analyzed phrases in + * this bad case for FieldPhraseList */ + /* But it is not reliable as a unit test since it is timing-dependent + public void testManyRepeatedTerms() throws Exception { + long t = System.currentTimeMillis(); + testManyTermsWithLimit (-1); + long t1 = System.currentTimeMillis(); + testManyTermsWithLimit (1); + long t2 = System.currentTimeMillis(); + assertTrue (t2-t1 * 1000 < t1-t); + } + private void testManyTermsWithLimit (int limit) throws Exception { + StringBuilder buf = new StringBuilder (); + for (int i = 0; i < 16000; i++) { + buf.append("a b c "); + } + make1d1fIndex( buf.toString()); + + Query query = tq("a"); + FieldQuery fq = new FieldQuery( query, true, true ); + FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); + FieldPhraseList fpl = new FieldPhraseList( stack, fq, limit); + if (limit < 0 || limit > 16000) + assertEquals( 16000, fpl.phraseList.size() ); + else + assertEquals( limit, fpl.phraseList.size() ); + assertEquals( "a(1.0)((0,1))", fpl.phraseList.get( 0 ).toString() ); + } + */ } diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 667ac90c021..c7d00cf41e5 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -254,6 +254,9 @@ New Features * SOLR-2458: post.jar enhanced to handle JSON, CSV and (janhoy) +* LUCENE-3234: add a new parameter hl.phraseLimit for FastVectorHighlighter speed up. + (Mike Sokolov via koji) + Optimizations ---------------------- diff --git a/solr/src/common/org/apache/solr/common/params/HighlightParams.java b/solr/src/common/org/apache/solr/common/params/HighlightParams.java index b8af3ad631d..8d68e2c3efe 100644 --- a/solr/src/common/org/apache/solr/common/params/HighlightParams.java +++ b/solr/src/common/org/apache/solr/common/params/HighlightParams.java @@ -45,6 +45,7 @@ public interface HighlightParams { public static final String USE_FVH = HIGHLIGHT + ".useFastVectorHighlighter"; public static final String TAG_PRE = HIGHLIGHT + ".tag.pre"; public static final String TAG_POST = HIGHLIGHT + ".tag.post"; + public static final String PHRASE_LIMIT = HIGHLIGHT + ".phraseLimit"; public static final String MULTI_VALUED_SEPARATOR = HIGHLIGHT + ".multiValuedSeparatorChar"; // Formatter diff --git a/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java b/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java index f95a940ae37..7d2b37d9581 100644 --- a/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java +++ b/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java @@ -362,6 +362,7 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf params.getBool( HighlightParams.USE_PHRASE_HIGHLIGHTER, true ), // FVH cannot process hl.requireFieldMatch parameter per-field basis params.getBool( HighlightParams.FIELD_MATCH, false ) ); + fvh.setPhraseLimit(params.getInt(HighlightParams.PHRASE_LIMIT, Integer.MAX_VALUE)); FieldQuery fieldQuery = fvh.getFieldQuery( query ); // Highlight each document