LUCENE-4728: Unknown and not explicitly mapped queries are now rewritten against the highlighting IndexReader to obtain primitive queries before discarding the query entirely.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1442590 13f79535-47bb-0310-9956-ffa450edef68
Simon Willnauer 2013-02-05 13:59:26 +00:00
parent d0428c106f
commit 81c7ba4601
10 changed files with 321 additions and 59 deletions
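
In practice the change means a Query subclass the highlighter has no explicit mapping for is no longer silently dropped: WeightedSpanTermExtractor (and FieldQuery on the FastVectorHighlighter side) now calls rewrite() against a reader over the highlighted text and extracts terms from the result. A minimal sketch of the newly supported pattern, modeled on the testHighlightUnknownQueryAfterRewrite test added below; the field name and term are illustrative only:

// A query type the extractor does not recognize; before this commit it was
// discarded, now its rewrite() result is used to find terms to highlight.
Query custom = new Query() {
  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    return new TermQuery(new Term("field", "kennedy")); // a primitive query
  }
  @Override
  public String toString(String field) {
    return "custom";
  }
};
Highlighter highlighter = new Highlighter(new QueryScorer(custom, "field"));
// getBestFragment(...) now marks up "kennedy" for this query.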

View File

@@ -61,6 +61,11 @@
<artifactId>lucene-memory</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>lucene-queries</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<build>
<sourceDirectory>${module-path}/src/java</sourceDirectory>

View File

@@ -94,6 +94,12 @@ New Features
* LUCENE-4723: Add AnalyzerFactoryTask to benchmark, and enable analyzer
creation via the resulting factories using NewAnalyzerTask. (Steve Rowe)
* LUCENE-4728: Unknown and not explicitly mapped queries are now rewritten
against the highlighting IndexReader to obtain primitive queries before
discarding the query entirely. WeightedSpanTermExtractor now builds a
MemoryIndex only once even if multiple fields are highlighted.
(Simon Willnauer)
API Changes
* LUCENE-4709: FacetResultNode no longer has a residue field. (Shai Erera)

View File

@@ -27,6 +27,7 @@
<path id="classpath">
<pathelement path="${memory.jar}"/>
<pathelement path="${queries.jar}"/>
<path refid="base.classpath"/>
</path>

View File

@@ -189,9 +189,7 @@ public class Highlighter
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
tokenStream.addAttribute(PositionIncrementAttribute.class);
tokenStream.reset();
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
if (fragmentScorer instanceof QueryScorer) {

View File

@@ -18,7 +18,7 @@ package org.apache.lucene.search.highlight;
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@@ -29,11 +29,18 @@ import java.util.TreeSet;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterAtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
import org.apache.lucene.search.spans.SpanFirstQuery;
@@ -44,6 +51,8 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
/**
* Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether
@@ -53,12 +62,13 @@ public class WeightedSpanTermExtractor {
private String fieldName;
private TokenStream tokenStream;
private Map<String,AtomicReaderContext> readers = new HashMap<String,AtomicReaderContext>(10);
private String defaultField;
private boolean expandMultiTermQuery;
private boolean cachedTokenStream;
private boolean wrapToCaching = true;
private int maxDocCharsToAnalyze;
private AtomicReader reader = null;
public WeightedSpanTermExtractor() {
}
@@ -69,18 +79,6 @@ public class WeightedSpanTermExtractor {
}
}
private void closeReaders() {
Collection<AtomicReaderContext> ctxSet = readers.values();
for (final AtomicReaderContext ctx : ctxSet) {
try {
ctx.reader().close();
} catch (IOException e) {
// alert?
}
}
}
/**
Fills a <code>Map</code> with {@link WeightedSpanTerm}s using the terms from the supplied <code>Query</code>.
*
@@ -146,21 +144,14 @@ public class WeightedSpanTermExtractor {
if (q != null) {
extract(q, terms);
}
} else if (query instanceof CommonTermsQuery) {
// specialized since rewriting would change the result query
// this query is TermContext sensitive.
extractWeightedTerms(terms, query);
} else if (query instanceof DisjunctionMaxQuery) {
for (Iterator<Query> iterator = ((DisjunctionMaxQuery) query).iterator(); iterator.hasNext();) {
extract(iterator.next(), terms);
}
} else if (query instanceof MultiTermQuery && expandMultiTermQuery) {
MultiTermQuery mtq = ((MultiTermQuery)query);
if(mtq.getRewriteMethod() != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) {
mtq = (MultiTermQuery) mtq.clone();
mtq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
query = mtq;
}
if (mtq.getField() != null) {
IndexReader ir = getLeafContextForField(mtq.getField()).reader();
extract(query.rewrite(ir), terms);
}
} else if (query instanceof MultiPhraseQuery) {
final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
final List<Term[]> termArrays = mpq.getTermArrays();
@@ -210,12 +201,30 @@
sp.setBoost(query.getBoost());
extractWeightedSpanTerms(terms, sp);
}
} else {
Query origQuery = query;
if (query instanceof MultiTermQuery) {
if (!expandMultiTermQuery) {
return;
}
MultiTermQuery copy = (MultiTermQuery) query.clone();
copy.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
origQuery = copy;
}
final IndexReader reader = getLeafContext().reader();
Query rewritten = origQuery.rewrite(reader);
if (rewritten != origQuery) {
// only rewrite once and then flatten again - the rewritten query could get special treatment
// if this method is overridden in a subclass, or higher up in the next recursion
extract(rewritten, terms);
}
}
extractUnknownQuery(query, terms);
}
protected void extractUnknownQuery(Query query,
Map<String, WeightedSpanTerm> terms) throws IOException {
// for sub-classing to extract custom queries
}
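
The empty extractUnknownQuery hook above is the new extension point: after the generic rewrite fallback, a subclass gets one last chance to map query types the extractor cannot handle itself. A hypothetical sketch; MyGeoQuery and its getTerm() accessor are invented for illustration and are not part of this commit:

class CustomSpanTermExtractor extends WeightedSpanTermExtractor {
  @Override
  protected void extractUnknownQuery(Query query, Map<String, WeightedSpanTerm> terms) throws IOException {
    if (query instanceof MyGeoQuery) {
      // surface the query's underlying term so it participates in highlighting
      Term term = ((MyGeoQuery) query).getTerm();
      terms.put(term.text(), new WeightedSpanTerm(query.getBoost(), term.text()));
    }
  }
}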
@@ -249,7 +258,7 @@ public class WeightedSpanTermExtractor {
final boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
if (mustRewriteQuery) {
for (final String field : fieldNames) {
final SpanQuery rewrittenQuery = (SpanQuery) spanQuery.rewrite(getLeafContextForField(field).reader());
final SpanQuery rewrittenQuery = (SpanQuery) spanQuery.rewrite(getLeafContext().reader());
queries.put(field, rewrittenQuery);
rewrittenQuery.extractTerms(nonWeightedTerms);
}
@@ -266,7 +275,7 @@ public class WeightedSpanTermExtractor {
} else {
q = spanQuery;
}
AtomicReaderContext context = getLeafContextForField(field);
AtomicReaderContext context = getLeafContext();
Map<Term,TermContext> termContexts = new HashMap<Term,TermContext>();
TreeSet<Term> extractedTerms = new TreeSet<Term>();
q.extractTerms(extractedTerms);
@@ -338,23 +347,69 @@ public class WeightedSpanTermExtractor {
return rv;
}
protected AtomicReaderContext getLeafContextForField(String field) throws IOException {
if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) {
protected AtomicReaderContext getLeafContext() throws IOException {
if (reader == null) {
if(wrapToCaching && !(tokenStream instanceof CachingTokenFilter)) {
assert !cachedTokenStream;
tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
cachedTokenStream = true;
}
AtomicReaderContext context = readers.get(field);
if (context == null) {
MemoryIndex indexer = new MemoryIndex();
indexer.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
final MemoryIndex indexer = new MemoryIndex(true);
indexer.addField(DelegatingAtomicReader.FIELD_NAME, tokenStream);
tokenStream.reset();
IndexSearcher searcher = indexer.createSearcher();
final IndexSearcher searcher = indexer.createSearcher();
// the MemoryIndex only ever has a single atomic (leaf) context
context = (AtomicReaderContext) searcher.getTopReaderContext();
readers.put(field, context);
reader = new DelegatingAtomicReader(((AtomicReaderContext)searcher.getTopReaderContext()).reader());
}
return reader.getContext();
}
return context;
/*
* This reader delegates every call to a single field of the wrapped
* AtomicReader. This way we only need to build that field once rather than
* N times.
*/
static final class DelegatingAtomicReader extends FilterAtomicReader {
private static final String FIELD_NAME = "shadowed_field";
DelegatingAtomicReader(AtomicReader in) {
super(in);
}
@Override
public FieldInfos getFieldInfos() {
throw new UnsupportedOperationException();
}
@Override
public Fields fields() throws IOException {
return new FilterFields(super.fields()) {
@Override
public Terms terms(String field) throws IOException {
return super.terms(DelegatingAtomicReader.FIELD_NAME);
}
@Override
public Iterator<String> iterator() {
return Collections.singletonList(DelegatingAtomicReader.FIELD_NAME).iterator();
}
@Override
public int size() {
return 1;
}
};
}
@Override
public DocValues docValues(String field) throws IOException {
return super.docValues(FIELD_NAME);
}
@Override
public DocValues normValues(String field) throws IOException {
return super.normValues(FIELD_NAME);
}
}
/**
@@ -401,7 +456,7 @@ public class WeightedSpanTermExtractor {
try {
extract(query, terms);
} finally {
closeReaders();
IOUtils.close(reader);
}
return terms;
@@ -449,8 +504,7 @@ public class WeightedSpanTermExtractor {
weightedSpanTerm.weight *= idf;
}
} finally {
closeReaders();
IOUtils.close(reader);
}
return terms;
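
The DelegatingAtomicReader above is what makes the single-MemoryIndex approach work: the token stream is indexed once under the internal "shadowed_field" name, and every per-field lookup a rewritten query performs is answered from that one field. A conceptual sketch of the effect, not verbatim extractor code:

// Inside the extractor, after getLeafContext() has built the MemoryIndex:
AtomicReaderContext ctx = getLeafContext();
// Both lookups resolve to the same shadowed field, so highlighting several
// fields of one document no longer builds one MemoryIndex per field.
Terms forTitle = ctx.reader().terms("title");
Terms forBody = ctx.reader().terms("body");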

View File

@@ -28,9 +28,12 @@ import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
@@ -92,8 +95,7 @@ public class FieldQuery {
if( !clause.isProhibited() )
flatten( clause.getQuery(), reader, flatQueries );
}
}
else if( sourceQuery instanceof DisjunctionMaxQuery ){
} else if( sourceQuery instanceof DisjunctionMaxQuery ){
DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
for( Query query : dmq ){
flatten( query, reader, flatQueries );
@@ -103,12 +105,6 @@ public class FieldQuery {
if( !flatQueries.contains( sourceQuery ) )
flatQueries.add( sourceQuery );
}
else if (sourceQuery instanceof MultiTermQuery && reader != null) {
MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone();
copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS));
BooleanQuery mtqTerms = (BooleanQuery) copy.rewrite(reader);
flatten(mtqTerms, reader, flatQueries);
}
else if( sourceQuery instanceof PhraseQuery ){
if( !flatQueries.contains( sourceQuery ) ){
PhraseQuery pq = (PhraseQuery)sourceQuery;
@@ -118,6 +114,31 @@
flatQueries.add( new TermQuery( pq.getTerms()[0] ) );
}
}
} else if (sourceQuery instanceof ConstantScoreQuery) {
final Query q = ((ConstantScoreQuery) sourceQuery).getQuery();
if (q != null) {
flatten(q, reader, flatQueries);
}
} else if (sourceQuery instanceof FilteredQuery) {
final Query q = ((FilteredQuery) sourceQuery).getQuery();
if (q != null) {
flatten(q, reader, flatQueries);
}
} else if (reader != null){
Query query = sourceQuery;
if (sourceQuery instanceof MultiTermQuery) {
MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone();
copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS));
query = copy;
}
Query rewritten = query.rewrite(reader);
if (rewritten != query) {
// only rewrite once and then flatten again - the rewritten query could get special treatment
// if this method is overridden in a subclass.
flatten(rewritten, reader, flatQueries);
}
// if the query is already rewritten we discard it
}
// else discard queries
}
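
FieldQuery.flatten now mirrors the extractor's fallback: ConstantScoreQuery and FilteredQuery are unwrapped, a MultiTermQuery gets a top-terms rewrite, and any other unknown query is given a single rewrite() pass before being discarded. A minimal sketch of a newly flattenable wrapper, following the testFlattenConstantScoreQuery test added below and using the package-visible FieldQuery constructor the tests use; field and terms are illustrative:

PhraseQuery pq = new PhraseQuery();
pq.add(new Term("field", "fast"));
pq.add(new Term("field", "highlighter"));
// Before this commit the wrapper hid the phrase from the FastVectorHighlighter;
// now flatten() unwraps it and the inner PhraseQuery gets highlighted.
Query wrapped = new ConstantScoreQuery(pq);
FieldQuery fq = new FieldQuery(wrapped, true, true);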

View File

@@ -46,6 +46,7 @@ import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
@@ -114,6 +115,87 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
}
}
public void testHighlightingCommonTermsQuery() throws Exception {
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
query.add(new Term(FIELD_NAME, "this"));
query.add(new Term(FIELD_NAME, "long"));
query.add(new Term(FIELD_NAME, "very"));
searcher = new IndexSearcher(reader);
TopDocs hits = searcher.search(query, 10);
assertEquals(2, hits.totalHits);
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(scorer);
StoredDocument doc = searcher.doc(hits.scoreDocs[0].doc);
String storedField = doc.get(FIELD_NAME);
TokenStream stream = TokenSources.getAnyTokenStream(searcher
.getIndexReader(), hits.scoreDocs[0].doc, FIELD_NAME, doc, analyzer);
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
highlighter.setTextFragmenter(fragmenter);
String fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
doc = searcher.doc(hits.scoreDocs[1].doc);
storedField = doc.get(FIELD_NAME);
stream = TokenSources.getAnyTokenStream(searcher
.getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
}
public void testHighlightUnknownQueryAfterRewrite() throws IOException, InvalidTokenOffsetsException {
Query query = new Query() {
@Override
public Query rewrite(IndexReader reader) throws IOException {
CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
query.add(new Term(FIELD_NAME, "this"));
query.add(new Term(FIELD_NAME, "long"));
query.add(new Term(FIELD_NAME, "very"));
return query;
}
@Override
public String toString(String field) {
return null;
}
};
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
searcher = new IndexSearcher(reader);
TopDocs hits = searcher.search(query, 10);
assertEquals(2, hits.totalHits);
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(scorer);
StoredDocument doc = searcher.doc(hits.scoreDocs[0].doc);
String storedField = doc.get(FIELD_NAME);
TokenStream stream = TokenSources.getAnyTokenStream(searcher
.getIndexReader(), hits.scoreDocs[0].doc, FIELD_NAME, doc, analyzer);
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
highlighter.setTextFragmenter(fragmenter);
String fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
doc = searcher.doc(hits.scoreDocs[1].doc);
storedField = doc.get(FIELD_NAME);
stream = TokenSources.getAnyTokenStream(searcher
.getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
}
public void testHighlightingWithDefaultField() throws Exception {
String s1 = "I call our world Flatland, not because we call it so,";

View File

@@ -18,6 +18,8 @@ package org.apache.lucene.search.vectorhighlight;
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@@ -26,7 +28,13 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@@ -62,4 +70,47 @@ public class FastVectorHighlighterTest extends LuceneTestCase {
writer.close();
dir.close();
}
public void testCommonTermsQueryHighlightTest() throws IOException {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPositions(true);
type.setStoreTermVectors(true);
type.freeze();
String[] texts = {
"Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
"This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
"JFK has been shot", "John Kennedy has been shot",
"This text has a typo in referring to Keneddy",
"wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" };
for (int i = 0; i < texts.length; i++) {
Document doc = new Document();
Field field = new Field("field", texts[i], type);
doc.add(field);
writer.addDocument(doc);
}
CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 2);
query.add(new Term("field", "text"));
query.add(new Term("field", "long"));
query.add(new Term("field", "very"));
FastVectorHighlighter highlighter = new FastVectorHighlighter();
IndexReader reader = DirectoryReader.open(writer, true);
IndexSearcher searcher = new IndexSearcher(reader);
TopDocs hits = searcher.search(query, 10);
assertEquals(2, hits.totalHits);
FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[0].doc, "field", 1000, 1);
assertEquals("This piece of <b>text</b> refers to Kennedy at the beginning then has a longer piece of <b>text</b> that is <b>very</b> <b>long</b> in the middle and finally ends with another reference to Kennedy", bestFragments[0]);
fieldQuery = highlighter.getFieldQuery(query, reader);
bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[1].doc, "field", 1000, 1);
assertEquals("Hello this is a piece of <b>text</b> that is <b>very</b> <b>long</b> and contains too much preamble and the meat is really here which says kennedy has been shot", bestFragments[0]);
reader.close();
writer.close();
dir.close();
}
}

View File

@@ -23,8 +23,13 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
@@ -35,6 +40,7 @@ import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
public class FieldQueryTest extends AbstractTestCase {
@@ -905,4 +911,40 @@ public class FieldQueryTest extends AbstractTestCase {
assertNotNull (fq.searchPhrase(F, phraseCandidate));
}
public void testStopRewrite() throws Exception {
Query q = new Query() {
@Override
public String toString(String field) {
return "DummyQuery";
}
};
make1d1fIndex( "a" );
assertNotNull(reader);
new FieldQuery(q, reader, true, true );
}
public void testFlattenFilteredQuery() throws Exception {
Query query = new FilteredQuery(pqF( "A" ), new Filter() {
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs)
throws IOException {
return null;
}
});
FieldQuery fq = new FieldQuery( query, true, true );
Set<Query> flatQueries = new HashSet<Query>();
fq.flatten( query, reader, flatQueries );
assertCollectionQueries( flatQueries, tq( "A" ) );
}
public void testFlattenConstantScoreQuery() throws Exception {
Query query = new ConstantScoreQuery(pqF( "A" ));
FieldQuery fq = new FieldQuery( query, true, true );
Set<Query> flatQueries = new HashSet<Query>();
fq.flatten( query, reader, flatQueries );
assertCollectionQueries( flatQueries, tq( "A" ) );
}
}

View File

@@ -465,7 +465,9 @@ public class MemoryIndex {
throw new RuntimeException(e);
} finally {
try {
if (stream != null) stream.close();
if (stream != null) {
stream.close();
}
} catch (IOException e2) {
throw new RuntimeException(e2);
}