LUCENE-5182: Terminate phrase searches early if max phrase window is exceeded

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1515847 13f79535-47bb-0310-9956-ffa450edef68
2013-08-20 15:45:03 +00:00 · 2013-08-20 15:45:03 +00:00 · 96bc27a195
parent cfbf8082d2
commit 96bc27a195
4 changed files with 64 additions and 2 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -212,6 +212,10 @@ Optimizations
  transition from DWPT into IndexWriter is now done via an Event-Queue
  processed from within the IndexWriter in order to prevent suituations
  where DWPT or DW calling int IW causing deadlocks. (Simon Willnauer)
 * LUCENE-5182: Terminate phrase searches early if max phrase window is 
  exceeded in FastVectorHighlighter to prevent very long running phrase
  extraction if phrase terms are high frequent. (Simon Willnauer)
 Documentation
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
@ -69,6 +69,9 @@ public class FieldPhraseList {
  }
  void extractPhrases(LinkedList<TermInfo> terms, QueryPhraseMap currMap, LinkedList<TermInfo> phraseCandidate, int longest) {
    if (phraseCandidate.size() > 1 && phraseCandidate.getLast().getPosition() - phraseCandidate.getFirst().getPosition() > currMap.getMaxPhraseWindow()) {
      return;
    }
    if (terms.isEmpty()) {
      if (longest > 0) {
        addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
@ -30,7 +30,6 @@ import java.util.Set;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.CommonTermsQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
@ -63,6 +62,8 @@ public class FieldQuery {
  // The maximum number of different matching terms accumulated from any one MultiTermQuery
  private static final int MAX_MTQ_TERMS = 1024;
  private int maxPhraseWindow = 1;
  FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
    this.fieldMatch = fieldMatch;
@ -400,7 +401,7 @@ public class FieldQuery {
            return positions[i] - positions[j];
          }
        }.sort(0, terms.length);
-
+        
        addToMap(pq, terms, positions, 0, subMap, pq.getSlop());
      }
      else
@ -474,8 +475,18 @@ public class FieldQuery {
        this.boost = boost;
        this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
        this.positions = positions;
        if (positions != null) {
          fieldQuery.maxPhraseWindow = Math.max(fieldQuery.maxPhraseWindow, slop + positions[positions.length-1] - positions[0]);
        }
      }
    }
    /**
     * The max phrase window based on the actual phrase positions and slop.
     */ 
    int getMaxPhraseWindow() {
      return fieldQuery.maxPhraseWindow;
    }
    public boolean isTerminal(){
      return terminal;
--- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
@ -47,6 +47,7 @@ import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
 public class FastVectorHighlighterTest extends LuceneTestCase {
@ -298,6 +299,49 @@ public class FastVectorHighlighterTest extends LuceneTestCase {
    writer.close();
    dir.close();
  }
  public void testLotsOfPhrases() throws IOException {
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT,  new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
    FieldType type = new FieldType(TextField.TYPE_STORED);
    type.setStoreTermVectorOffsets(true);
    type.setStoreTermVectorPositions(true);
    type.setStoreTermVectors(true);
    type.freeze();
    String[] terms = { "org", "apache", "lucene"};
    int iters = atLeast(1000);
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < iters; i++) {
      builder.append(terms[random().nextInt(terms.length)]).append(" ");
      if (random().nextInt(6) == 3) {
        builder.append("solr").append(" ");
      }
    }
      Document doc = new Document();
      Field field = new Field("field", builder.toString(), type);
      doc.add(field);
      writer.addDocument(doc);
    PhraseQuery query = new PhraseQuery();
    query.add(new Term("field", "org"));
    query.add(new Term("field", "apache"));
    query.add(new Term("field", "lucene"));
    FastVectorHighlighter highlighter = new FastVectorHighlighter();
    IndexReader reader = DirectoryReader.open(writer, true);
    IndexSearcher searcher = newSearcher(reader);
    TopDocs hits = searcher.search(query, 10);
    assertEquals(1, hits.totalHits);
    FieldQuery fieldQuery  = highlighter.getFieldQuery(query, reader);
    String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[0].doc, "field", 1000, 1);
    for (int i = 0; i < bestFragments.length; i++) {
      String result = bestFragments[i].replaceAll("<b>org apache lucene</b>", "FOOBAR");
      assertFalse(result.contains("org apache lucene"));
    }
    reader.close();
    writer.close();
    dir.close();
  }
  public void testOverlappingPhrases() throws IOException {
    final Analyzer analyzer = new Analyzer() {