mirror of https://github.com/apache/lucene.git
LUCENE-5182: Terminate phrase searches early if max phrase window is exceeded
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1515847 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cfbf8082d2
commit
96bc27a195
|
@ -213,6 +213,10 @@ Optimizations
|
||||||
processed from within the IndexWriter in order to prevent situations
|
processed from within the IndexWriter in order to prevent situations
|
||||||
where DWPT or DW calling into IW causes deadlocks. (Simon Willnauer)
|
where DWPT or DW calling into IW causes deadlocks. (Simon Willnauer)
|
||||||
|
|
||||||
|
* LUCENE-5182: Terminate phrase searches early if max phrase window is
|
||||||
|
exceeded in FastVectorHighlighter to prevent very long running phrase
|
||||||
|
extraction if phrase terms are highly frequent. (Simon Willnauer)
|
||||||
|
|
||||||
Documentation
|
Documentation
|
||||||
|
|
||||||
* LUCENE-4894: remove facet userguide as it was outdated. Partially absorbed into
|
* LUCENE-4894: remove facet userguide as it was outdated. Partially absorbed into
|
||||||
|
|
|
@ -69,6 +69,9 @@ public class FieldPhraseList {
|
||||||
}
|
}
|
||||||
|
|
||||||
void extractPhrases(LinkedList<TermInfo> terms, QueryPhraseMap currMap, LinkedList<TermInfo> phraseCandidate, int longest) {
|
void extractPhrases(LinkedList<TermInfo> terms, QueryPhraseMap currMap, LinkedList<TermInfo> phraseCandidate, int longest) {
|
||||||
|
if (phraseCandidate.size() > 1 && phraseCandidate.getLast().getPosition() - phraseCandidate.getFirst().getPosition() > currMap.getMaxPhraseWindow()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (terms.isEmpty()) {
|
if (terms.isEmpty()) {
|
||||||
if (longest > 0) {
|
if (longest > 0) {
|
||||||
addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
|
addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
|
||||||
|
|
|
@ -30,7 +30,6 @@ import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.queries.CommonTermsQuery;
|
|
||||||
import org.apache.lucene.search.BooleanClause;
|
import org.apache.lucene.search.BooleanClause;
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
import org.apache.lucene.search.ConstantScoreQuery;
|
import org.apache.lucene.search.ConstantScoreQuery;
|
||||||
|
@ -64,6 +63,8 @@ public class FieldQuery {
|
||||||
// The maximum number of different matching terms accumulated from any one MultiTermQuery
|
// The maximum number of different matching terms accumulated from any one MultiTermQuery
|
||||||
private static final int MAX_MTQ_TERMS = 1024;
|
private static final int MAX_MTQ_TERMS = 1024;
|
||||||
|
|
||||||
|
private int maxPhraseWindow = 1;
|
||||||
|
|
||||||
FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
|
FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
|
||||||
this.fieldMatch = fieldMatch;
|
this.fieldMatch = fieldMatch;
|
||||||
Set<Query> flatQueries = new LinkedHashSet<Query>();
|
Set<Query> flatQueries = new LinkedHashSet<Query>();
|
||||||
|
@ -474,8 +475,18 @@ public class FieldQuery {
|
||||||
this.boost = boost;
|
this.boost = boost;
|
||||||
this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
|
this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
|
||||||
this.positions = positions;
|
this.positions = positions;
|
||||||
|
if (positions != null) {
|
||||||
|
fieldQuery.maxPhraseWindow = Math.max(fieldQuery.maxPhraseWindow, slop + positions[positions.length-1] - positions[0]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The max phrase window based on the actual phrase positions and slop.
* <p>
* The value is read from the owning {@code fieldQuery}, not from this map
* entry. That field starts at 1 and is raised with {@code Math.max} to
* {@code slop + lastPosition - firstPosition} whenever a phrase with
* non-null positions is marked, so the result is the widest window seen
* so far for the whole query.
*
* @return the current {@code maxPhraseWindow} of the owning field query
|
||||||
|
*/
|
||||||
|
int getMaxPhraseWindow() {
|
||||||
|
return fieldQuery.maxPhraseWindow;
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isTerminal(){
|
public boolean isTerminal(){
|
||||||
return terminal;
|
return terminal;
|
||||||
|
|
|
@ -47,6 +47,7 @@ import org.apache.lucene.search.TermQuery;
|
||||||
import org.apache.lucene.search.TopDocs;
|
import org.apache.lucene.search.TopDocs;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
|
|
||||||
public class FastVectorHighlighterTest extends LuceneTestCase {
|
public class FastVectorHighlighterTest extends LuceneTestCase {
|
||||||
|
@ -299,6 +300,49 @@ public class FastVectorHighlighterTest extends LuceneTestCase {
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Highlights the phrase "org apache lucene" in a single document made of at
 * least 1000 randomly chosen occurrences of those three terms (with "solr"
 * sprinkled in), then checks that every occurrence of the phrase in the
 * returned fragments is wrapped in highlight tags.
 */
public void testLotsOfPhrases() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
|
||||||
|
// Stored text field that also keeps term vectors with positions and offsets.
FieldType type = new FieldType(TextField.TYPE_STORED);
|
||||||
|
type.setStoreTermVectorOffsets(true);
|
||||||
|
type.setStoreTermVectorPositions(true);
|
||||||
|
type.setStoreTermVectors(true);
|
||||||
|
type.freeze();
|
||||||
|
String[] terms = { "org", "apache", "lucene"};
|
||||||
|
int iters = atLeast(1000);
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
// Build the field text: one random phrase term per iteration, so each term occurs many times.
for (int i = 0; i < iters; i++) {
|
||||||
|
builder.append(terms[random().nextInt(terms.length)]).append(" ");
|
||||||
|
// Roughly one iteration in six also appends a term that is not part of the phrase.
if (random().nextInt(6) == 3) {
|
||||||
|
builder.append("solr").append(" ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Document doc = new Document();
|
||||||
|
Field field = new Field("field", builder.toString(), type);
|
||||||
|
doc.add(field);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
// Phrase query for "org apache lucene" on the same field.
PhraseQuery query = new PhraseQuery();
|
||||||
|
query.add(new Term("field", "org"));
|
||||||
|
query.add(new Term("field", "apache"));
|
||||||
|
query.add(new Term("field", "lucene"));
|
||||||
|
|
||||||
|
|
||||||
|
FastVectorHighlighter highlighter = new FastVectorHighlighter();
|
||||||
|
IndexReader reader = DirectoryReader.open(writer, true);
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
TopDocs hits = searcher.search(query, 10);
|
||||||
|
// NOTE(review): expects the random text to contain the phrase at least once — not guaranteed by the loop above.
assertEquals(1, hits.totalHits);
|
||||||
|
FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
|
||||||
|
String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[0].doc, "field", 1000, 1);
|
||||||
|
for (int i = 0; i < bestFragments.length; i++) {
|
||||||
|
// Mask every highlighted phrase; any plain occurrence left over was missed by the highlighter.
String result = bestFragments[i].replaceAll("<b>org apache lucene</b>", "FOOBAR");
|
||||||
|
assertFalse(result.contains("org apache lucene"));
|
||||||
|
}
|
||||||
|
reader.close();
|
||||||
|
writer.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
public void testOverlappingPhrases() throws IOException {
|
public void testOverlappingPhrases() throws IOException {
|
||||||
final Analyzer analyzer = new Analyzer() {
|
final Analyzer analyzer = new Analyzer() {
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue