diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7d14c63e774..05be1fe587b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -212,6 +212,10 @@ Optimizations transition from DWPT into IndexWriter is now done via an Event-Queue processed from within the IndexWriter in order to prevent suituations where DWPT or DW calling int IW causing deadlocks. (Simon Willnauer) + +* LUCENE-5182: Terminate phrase searches early if max phrase window is + exceeded in FastVectorHighlighter to prevent very long running phrase + extraction if phrase terms are high frequent. (Simon Willnauer) Documentation diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java index 0168bbe87ba..ae24de9a14f 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java @@ -69,6 +69,9 @@ public class FieldPhraseList { } void extractPhrases(LinkedList terms, QueryPhraseMap currMap, LinkedList phraseCandidate, int longest) { + if (phraseCandidate.size() > 1 && phraseCandidate.getLast().getPosition() - phraseCandidate.getFirst().getPosition() > currMap.getMaxPhraseWindow()) { + return; + } if (terms.isEmpty()) { if (longest > 0) { addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java index 5333862692f..ae3b5d13894 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java @@ -30,7 +30,6 @@ import java.util.Set; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.ConstantScoreQuery; @@ -63,6 +62,8 @@ public class FieldQuery { // The maximum number of different matching terms accumulated from any one MultiTermQuery private static final int MAX_MTQ_TERMS = 1024; + + private int maxPhraseWindow = 1; FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException { this.fieldMatch = fieldMatch; @@ -400,7 +401,7 @@ public class FieldQuery { return positions[i] - positions[j]; } }.sort(0, terms.length); - + addToMap(pq, terms, positions, 0, subMap, pq.getSlop()); } else @@ -474,8 +475,18 @@ public class FieldQuery { this.boost = boost; this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber(); this.positions = positions; + if (positions != null) { + fieldQuery.maxPhraseWindow = Math.max(fieldQuery.maxPhraseWindow, slop + positions[positions.length-1] - positions[0]); + } } } + + /** + * The max phrase window based on the actual phrase positions and slop. + */ + int getMaxPhraseWindow() { + return fieldQuery.maxPhraseWindow; + } public boolean isTerminal(){ return terminal; diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java index 137effb8dc3..24a3220acef 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java @@ -47,6 +47,7 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; public class FastVectorHighlighterTest extends LuceneTestCase { @@ -298,6 +299,49 @@ public class FastVectorHighlighterTest extends LuceneTestCase { writer.close(); dir.close(); } + + public void testLotsOfPhrases() throws IOException { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))); + FieldType type = new FieldType(TextField.TYPE_STORED); + type.setStoreTermVectorOffsets(true); + type.setStoreTermVectorPositions(true); + type.setStoreTermVectors(true); + type.freeze(); + String[] terms = { "org", "apache", "lucene"}; + int iters = atLeast(1000); + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < iters; i++) { + builder.append(terms[random().nextInt(terms.length)]).append(" "); + if (random().nextInt(6) == 3) { + builder.append("solr").append(" "); + } + } + Document doc = new Document(); + Field field = new Field("field", builder.toString(), type); + doc.add(field); + writer.addDocument(doc); + PhraseQuery query = new PhraseQuery(); + query.add(new Term("field", "org")); + query.add(new Term("field", "apache")); + query.add(new Term("field", "lucene")); + + + FastVectorHighlighter highlighter = new FastVectorHighlighter(); + IndexReader reader = DirectoryReader.open(writer, true); + IndexSearcher searcher = newSearcher(reader); + TopDocs hits = searcher.search(query, 10); + assertEquals(1, hits.totalHits); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[0].doc, "field", 1000, 1); + for (int i = 0; i < bestFragments.length; i++) { + String result = bestFragments[i].replaceAll("org apache lucene", "FOOBAR"); + assertFalse(result.contains("org apache lucene")); + } + reader.close(); + writer.close(); + dir.close(); + } public void testOverlappingPhrases() throws IOException { final Analyzer analyzer = new Analyzer() {