LUCENE-5182: Terminate phrase searches early if max phrase window is exceeded

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1515847 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2013-08-20 15:45:03 +00:00
parent cfbf8082d2
commit 96bc27a195
4 changed files with 64 additions and 2 deletions

View File

@ -212,6 +212,10 @@ Optimizations
transition from DWPT into IndexWriter is now done via an Event-Queue transition from DWPT into IndexWriter is now done via an Event-Queue
processed from within the IndexWriter in order to prevent situations processed from within the IndexWriter in order to prevent situations
where DWPT or DW calling into IW causes deadlocks. (Simon Willnauer) where DWPT or DW calling into IW causes deadlocks. (Simon Willnauer)
* LUCENE-5182: Terminate phrase searches early if max phrase window is
exceeded in FastVectorHighlighter to prevent very long running phrase
extraction if phrase terms are highly frequent. (Simon Willnauer)
Documentation Documentation

View File

@ -69,6 +69,9 @@ public class FieldPhraseList {
} }
void extractPhrases(LinkedList<TermInfo> terms, QueryPhraseMap currMap, LinkedList<TermInfo> phraseCandidate, int longest) { void extractPhrases(LinkedList<TermInfo> terms, QueryPhraseMap currMap, LinkedList<TermInfo> phraseCandidate, int longest) {
if (phraseCandidate.size() > 1 && phraseCandidate.getLast().getPosition() - phraseCandidate.getFirst().getPosition() > currMap.getMaxPhraseWindow()) {
return;
}
if (terms.isEmpty()) { if (terms.isEmpty()) {
if (longest > 0) { if (longest > 0) {
addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );

View File

@ -30,7 +30,6 @@ import java.util.Set;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.ConstantScoreQuery;
@ -63,6 +62,8 @@ public class FieldQuery {
// The maximum number of different matching terms accumulated from any one MultiTermQuery // The maximum number of different matching terms accumulated from any one MultiTermQuery
private static final int MAX_MTQ_TERMS = 1024; private static final int MAX_MTQ_TERMS = 1024;
private int maxPhraseWindow = 1;
FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException { FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
this.fieldMatch = fieldMatch; this.fieldMatch = fieldMatch;
@ -400,7 +401,7 @@ public class FieldQuery {
return positions[i] - positions[j]; return positions[i] - positions[j];
} }
}.sort(0, terms.length); }.sort(0, terms.length);
addToMap(pq, terms, positions, 0, subMap, pq.getSlop()); addToMap(pq, terms, positions, 0, subMap, pq.getSlop());
} }
else else
@ -474,8 +475,18 @@ public class FieldQuery {
this.boost = boost; this.boost = boost;
this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber(); this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
this.positions = positions; this.positions = positions;
if (positions != null) {
fieldQuery.maxPhraseWindow = Math.max(fieldQuery.maxPhraseWindow, slop + positions[positions.length-1] - positions[0]);
}
} }
} }
/**
 * Returns the maximum phrase window recorded on the owning {@code FieldQuery}.
 * The value is raised whenever a phrase with known positions is registered, to
 * at least {@code slop + (last position - first position)} of that phrase.
 *
 * @return the largest phrase window seen so far
 */
int getMaxPhraseWindow() {
  final int window = fieldQuery.maxPhraseWindow;
  return window;
}
public boolean isTerminal(){ public boolean isTerminal(){
return terminal; return terminal;

View File

@ -47,6 +47,7 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
public class FastVectorHighlighterTest extends LuceneTestCase { public class FastVectorHighlighterTest extends LuceneTestCase {
@ -298,6 +299,49 @@ public class FastVectorHighlighterTest extends LuceneTestCase {
writer.close(); writer.close();
dir.close(); dir.close();
} }
/**
 * Highlights the exact phrase "org apache lucene" in a single document built
 * from at least 1000 randomly chosen occurrences of those three terms, with
 * "solr" randomly interspersed. After every highlighted phrase is masked, no
 * un-highlighted occurrence of the phrase may remain in the returned fragments.
 */
public void testLotsOfPhrases() throws IOException {
  final Directory directory = newDirectory();
  final MockAnalyzer analyzer =
      new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
  final IndexWriter indexWriter =
      new IndexWriter(directory, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));

  // Stored text field with term vectors, positions and offsets enabled.
  final FieldType vectorType = new FieldType(TextField.TYPE_STORED);
  vectorType.setStoreTermVectorOffsets(true);
  vectorType.setStoreTermVectorPositions(true);
  vectorType.setStoreTermVectors(true);
  vectorType.freeze();

  // Generate a long text in which every phrase term occurs very often.
  final String[] vocabulary = { "org", "apache", "lucene" };
  final int numTokens = atLeast(1000);
  final StringBuilder text = new StringBuilder();
  int appended = 0;
  while (appended < numTokens) {
    text.append(vocabulary[random().nextInt(vocabulary.length)]).append(" ");
    // Roughly one time in six, break up the sequence with an unrelated term.
    if (random().nextInt(6) == 3) {
      text.append("solr").append(" ");
    }
    appended++;
  }

  final Document document = new Document();
  document.add(new Field("field", text.toString(), vectorType));
  indexWriter.addDocument(document);

  // The phrase terms are added in the same order as they are listed in the vocabulary.
  final PhraseQuery phrase = new PhraseQuery();
  for (String term : vocabulary) {
    phrase.add(new Term("field", term));
  }

  final FastVectorHighlighter fvh = new FastVectorHighlighter();
  final IndexReader indexReader = DirectoryReader.open(indexWriter, true);
  final IndexSearcher indexSearcher = newSearcher(indexReader);
  final TopDocs topDocs = indexSearcher.search(phrase, 10);
  assertEquals(1, topDocs.totalHits);

  final FieldQuery fieldQuery = fvh.getFieldQuery(phrase, indexReader);
  final String[] fragments =
      fvh.getBestFragments(fieldQuery, indexReader, topDocs.scoreDocs[0].doc, "field", 1000, 1);
  for (String fragment : fragments) {
    // Mask highlighted phrases; anything left over would be a missed match.
    final String masked = fragment.replaceAll("<b>org apache lucene</b>", "FOOBAR");
    assertFalse(masked.contains("org apache lucene"));
  }

  indexReader.close();
  indexWriter.close();
  directory.close();
}
public void testOverlappingPhrases() throws IOException { public void testOverlappingPhrases() throws IOException {
final Analyzer analyzer = new Analyzer() { final Analyzer analyzer = new Analyzer() {