Prevent FVH from entering a very long-running loop on large docs with high-frequency phrase terms.

Terminate phrase searches early in FastVectorHighlighter once the maximum
phrase window is exceeded, to prevent very long-running phrase
extraction when phrase terms are highly frequent. See LUCENE-5182.

Closes #3543
This commit is contained in:
Simon Willnauer 2013-08-20 16:05:33 +02:00
parent 65056a63a1
commit 9af7a850e9
3 changed files with 99 additions and 0 deletions

View File

@ -70,6 +70,9 @@ public class XFieldPhraseList {
}
void extractPhrases(LinkedList<TermInfo> terms, QueryPhraseMap currMap, LinkedList<TermInfo> phraseCandidate, int longest) {
if (phraseCandidate.size() > 1 && phraseCandidate.getLast().getPosition() - phraseCandidate.getFirst().getPosition() > currMap.getMaxPhraseWindow()) {
return;
}
if (terms.isEmpty()) {
if (longest > 0) {
addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );

View File

@ -43,6 +43,8 @@ public class XFieldQuery {
Map<String, Set<String>> termSetMap = new HashMap<String, Set<String>>();
int termOrPhraseNumber; // used for colored tag support
private int maxPhraseWindow = 1;
// The maximum number of different matching terms accumulated from any one MultiTermQuery
private static final int MAX_MTQ_TERMS = 1024;
@ -456,9 +458,16 @@ public class XFieldQuery {
this.boost = boost;
this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
this.positions = positions;
if (positions != null) {
fieldQuery.maxPhraseWindow = Math.max(fieldQuery.maxPhraseWindow, slop + positions[positions.length-1] - positions[0]);
}
}
}
/**
 * Returns the maximum phrase window currently recorded on the associated field query.
 * <p>
 * The underlying counter starts at 1 and is raised to
 * {@code slop + (last position - first position)} whenever a larger value is seen for an
 * entry that carries positions. Phrase extraction compares the position span of a
 * phrase candidate against this value and gives up on the candidate once the span
 * exceeds it.
 *
 * @return the largest phrase window seen so far
 */
public int getMaxPhraseWindow() {
return fieldQuery.maxPhraseWindow;
}
/**
 * Returns the current value of the {@code terminal} flag.
 * <p>
 * NOTE(review): presumably {@code true} means this map entry ends a complete
 * term or phrase; the declaration of the flag is not visible here, so confirm.
 *
 * @return the {@code terminal} flag
 */
public boolean isTerminal(){
return terminal;
}

View File

@ -0,0 +1,87 @@
package org.apache.lucene.search.vectorhighlight;
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
import java.io.IOException;
/**
 * Exercises {@link XFastVectorHighlighter} phrase highlighting on a single large
 * document that consists almost entirely of the phrase terms themselves
 * (see LUCENE-5182).
 */
public class XFastVectorHighlighterTest extends LuceneTestCase {

    /**
     * Indexes one document built from randomly chosen tokens of
     * {@code org}, {@code apache} and {@code lucene} (with {@code elasticsearch}
     * sprinkled in), runs the phrase query {@code "org apache lucene"} against it and
     * checks that every occurrence of the phrase in the returned fragments is wrapped
     * in highlight tags.
     *
     * @throws IOException if indexing or reading the index fails
     */
    @Test
    public void testLotsOfPhrases() throws IOException {
        Directory directory = newDirectory();
        MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
        IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));

        // Stored text field that also records term vectors with positions and offsets.
        FieldType vectorType = new FieldType(TextField.TYPE_STORED);
        vectorType.setStoreTermVectors(true);
        vectorType.setStoreTermVectorPositions(true);
        vectorType.setStoreTermVectorOffsets(true);
        vectorType.freeze();

        // Build the document text: a long random sequence of the three phrase terms,
        // occasionally interrupted by an unrelated token.
        String[] phraseTerms = {"org", "apache", "lucene"};
        StringBuilder text = new StringBuilder();
        for (int remaining = atLeast(1000); remaining > 0; remaining--) {
            String token = phraseTerms[random().nextInt(phraseTerms.length)];
            text.append(token).append(" ");
            boolean insertNoise = random().nextInt(6) == 3;
            if (insertNoise) {
                text.append("elasticsearch").append(" ");
            }
        }

        Document document = new Document();
        document.add(new Field("field", text.toString(), vectorType));
        indexWriter.addDocument(document);

        // Phrase query made of the same three terms, in order.
        PhraseQuery phrase = new PhraseQuery();
        for (String term : phraseTerms) {
            phrase.add(new Term("field", term));
        }

        XFastVectorHighlighter fvh = new XFastVectorHighlighter();
        IndexReader indexReader = DirectoryReader.open(indexWriter, true);
        IndexSearcher indexSearcher = newSearcher(indexReader);
        TopDocs topDocs = indexSearcher.search(phrase, 10);
        assertEquals(1, topDocs.totalHits);

        XFieldQuery fieldQuery = fvh.getFieldQuery(phrase, indexReader);
        int docId = topDocs.scoreDocs[0].doc;
        String[] fragments = fvh.getBestFragments(fieldQuery, indexReader, docId, "field", 1000, 1);
        for (String fragment : fragments) {
            // Mask the highlighted phrases; any phrase left over was not highlighted.
            String masked = fragment.replaceAll("<b>org apache lucene</b>", "FOOBAR");
            assertFalse(masked.contains("org apache lucene"));
        }

        indexReader.close();
        indexWriter.close();
        directory.close();
    }
}