mirror of https://github.com/apache/lucene.git
LUCENE-4728: Unknown and not explicitly mapped queries are now rewritten against the highlighting IndexReader to obtain primitive queries before discarding the query entirely.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1442590 13f79535-47bb-0310-9956-ffa450edef68
parent d0428c106f
commit 81c7ba4601

pom.xml (highlighter module):
@@ -61,6 +61,11 @@
     <artifactId>lucene-memory</artifactId>
     <version>${project.version}</version>
   </dependency>
+  <dependency>
+    <groupId>${project.groupId}</groupId>
+    <artifactId>lucene-queries</artifactId>
+    <version>${project.version}</version>
+  </dependency>
 </dependencies>
 <build>
   <sourceDirectory>${module-path}/src/java</sourceDirectory>
lucene/CHANGES.txt:
@@ -94,6 +94,12 @@ New Features

 * LUCENE-4723: Add AnalyzerFactoryTask to benchmark, and enable analyzer
   creation via the resulting factories using NewAnalyzerTask. (Steve Rowe)

+* LUCENE-4728: Unknown and not explicitly mapped queries are now rewritten
+  against the highlighting IndexReader to obtain primitive queries before
+  discarding the query entirely. WeightedSpanTermExtractor now builds a
+  MemoryIndex only once even if multiple fields are highlighted.
+  (Simon Willnauer)
+
 API Changes

 * LUCENE-4709: FacetResultNode no longer has a residue field. (Shai Erera)
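A usage-level sketch of what the entry describes (our example, not part of the commit; Lucene 4.x API, where "field", tokenStream and text are placeholders set up as in the tests further down): a Query subclass the highlighter has no explicit mapping for is no longer silently dropped, because it is rewritten against the highlighting IndexReader and its primitive form is highlighted.

    Query custom = new Query() {          // type unknown to WeightedSpanTermExtractor
      @Override
      public Query rewrite(IndexReader reader) throws IOException {
        // the query only reveals its primitive form via rewrite
        return new TermQuery(new Term("field", "kennedy"));
      }
      @Override
      public String toString(String field) {
        return "custom(field:kennedy)";
      }
    };
    Highlighter highlighter = new Highlighter(new QueryScorer(custom, "field"));
    String best = highlighter.getBestFragment(tokenStream, text); // now marks <B>kennedy</B>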
lucene/highlighter/build.xml:
@@ -27,6 +27,7 @@

 <path id="classpath">
   <pathelement path="${memory.jar}"/>
+  <pathelement path="${queries.jar}"/>
   <path refid="base.classpath"/>
 </path>
Highlighter.java:
@@ -189,9 +189,7 @@ public class Highlighter

     CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
     OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
-    tokenStream.addAttribute(PositionIncrementAttribute.class);
-    tokenStream.reset();

     TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());

     if (fragmentScorer instanceof QueryScorer) {
WeightedSpanTermExtractor.java:
@@ -18,7 +18,7 @@ package org.apache.lucene.search.highlight;
  */
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -29,11 +29,18 @@ import java.util.TreeSet;

 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FilterAtomicReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.memory.MemoryIndex;
+import org.apache.lucene.queries.CommonTermsQuery;
 import org.apache.lucene.search.*;
 import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
 import org.apache.lucene.search.spans.SpanFirstQuery;
@@ -44,6 +51,8 @@ import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.search.spans.Spans;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.IOUtils;

 /**
  * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether
@@ -53,12 +62,13 @@ public class WeightedSpanTermExtractor {

   private String fieldName;
   private TokenStream tokenStream;
-  private Map<String,AtomicReaderContext> readers = new HashMap<String,AtomicReaderContext>(10);
   private String defaultField;
   private boolean expandMultiTermQuery;
   private boolean cachedTokenStream;
   private boolean wrapToCaching = true;
   private int maxDocCharsToAnalyze;
+  private AtomicReader reader = null;

   public WeightedSpanTermExtractor() {
   }
@@ -69,18 +79,6 @@ public class WeightedSpanTermExtractor {
     }
   }

-  private void closeReaders() {
-    Collection<AtomicReaderContext> ctxSet = readers.values();
-
-    for (final AtomicReaderContext ctx : ctxSet) {
-      try {
-        ctx.reader().close();
-      } catch (IOException e) {
-        // alert?
-      }
-    }
-  }
-
   /**
    * Fills a <code>Map</code> with {@link WeightedSpanTerm}s using the terms from the supplied <code>Query</code>.
    *
@@ -146,21 +144,14 @@ public class WeightedSpanTermExtractor {
       if (q != null) {
         extract(q, terms);
       }
+    } else if (query instanceof CommonTermsQuery) {
+      // specialized since rewriting would change the result query
+      // this query is TermContext sensitive.
+      extractWeightedTerms(terms, query);
     } else if (query instanceof DisjunctionMaxQuery) {
       for (Iterator<Query> iterator = ((DisjunctionMaxQuery) query).iterator(); iterator.hasNext();) {
         extract(iterator.next(), terms);
       }
-    } else if (query instanceof MultiTermQuery && expandMultiTermQuery) {
-      MultiTermQuery mtq = ((MultiTermQuery)query);
-      if(mtq.getRewriteMethod() != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) {
-        mtq = (MultiTermQuery) mtq.clone();
-        mtq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
-        query = mtq;
-      }
-      if (mtq.getField() != null) {
-        IndexReader ir = getLeafContextForField(mtq.getField()).reader();
-        extract(query.rewrite(ir), terms);
-      }
     } else if (query instanceof MultiPhraseQuery) {
       final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
       final List<Term[]> termArrays = mpq.getTermArrays();
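Why the special case: CommonTermsQuery classifies each added term as a high- or low-frequency clause from index statistics, so rewriting it against the single-document highlighting index would change which clauses it produces. Extracting its terms directly avoids that. A hedged sketch (assuming, as the call above implies, that CommonTermsQuery exposes its added terms via extractTerms):

    CommonTermsQuery ctq = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
    ctq.add(new Term("field", "this"));
    ctq.add(new Term("field", "very"));
    Set<Term> termSet = new HashSet<Term>();
    ctq.extractTerms(termSet); // [field:this, field:very], no rewrite, no frequency lookups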
@@ -210,12 +201,30 @@ public class WeightedSpanTermExtractor {
         sp.setBoost(query.getBoost());
         extractWeightedSpanTerms(terms, sp);
       }
+    } else {
+      Query origQuery = query;
+      if (query instanceof MultiTermQuery) {
+        if (!expandMultiTermQuery) {
+          return;
+        }
+        MultiTermQuery copy = (MultiTermQuery) query.clone();
+        copy.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
+        origQuery = copy;
+      }
+      final IndexReader reader = getLeafContext().reader();
+      Query rewritten = origQuery.rewrite(reader);
+      if (rewritten != origQuery) {
+        // only rewrite once and then flatten again - the rewritten query could have a special treatment
+        // if this method is overwritten in a subclass or above in the next recursion
+        extract(rewritten, terms);
+      }
     }
+    extractUnknownQuery(query, terms);
   }

+  protected void extractUnknownQuery(Query query,
+      Map<String, WeightedSpanTerm> terms) throws IOException {
+    // for sub-classing to extract custom queries
+  }
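The empty extractUnknownQuery hook is the commit's extension point for query types that even the rewrite fallback cannot decompose. A hypothetical subclass sketch (MyWrapperQuery and its getWrapped() are invented for illustration):

    public class CustomExtractor extends WeightedSpanTermExtractor {
      @Override
      protected void extractUnknownQuery(Query query, Map<String, WeightedSpanTerm> terms) throws IOException {
        if (query instanceof MyWrapperQuery) {                    // hypothetical custom type
          extract(((MyWrapperQuery) query).getWrapped(), terms);  // reuse the built-in extraction
        }
      }
    }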
@@ -249,7 +258,7 @@ public class WeightedSpanTermExtractor {
     final boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
     if (mustRewriteQuery) {
       for (final String field : fieldNames) {
-        final SpanQuery rewrittenQuery = (SpanQuery) spanQuery.rewrite(getLeafContextForField(field).reader());
+        final SpanQuery rewrittenQuery = (SpanQuery) spanQuery.rewrite(getLeafContext().reader());
         queries.put(field, rewrittenQuery);
         rewrittenQuery.extractTerms(nonWeightedTerms);
       }
@@ -266,7 +275,7 @@ public class WeightedSpanTermExtractor {
     } else {
       q = spanQuery;
     }
-    AtomicReaderContext context = getLeafContextForField(field);
+    AtomicReaderContext context = getLeafContext();
     Map<Term,TermContext> termContexts = new HashMap<Term,TermContext>();
     TreeSet<Term> extractedTerms = new TreeSet<Term>();
     q.extractTerms(extractedTerms);
@@ -338,23 +347,69 @@ public class WeightedSpanTermExtractor {
     return rv;
   }

-  protected AtomicReaderContext getLeafContextForField(String field) throws IOException {
-    if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) {
-      tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
-      cachedTokenStream = true;
-    }
-    AtomicReaderContext context = readers.get(field);
-    if (context == null) {
-      MemoryIndex indexer = new MemoryIndex();
-      indexer.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
-      IndexSearcher searcher = indexer.createSearcher();
-      // MEM index has only atomic ctx
-      context = (AtomicReaderContext) searcher.getTopReaderContext();
-      readers.put(field, context);
-    }
-    return context;
-  }
+  protected AtomicReaderContext getLeafContext() throws IOException {
+    if (reader == null) {
+      if(wrapToCaching && !(tokenStream instanceof CachingTokenFilter)) {
+        assert !cachedTokenStream;
+        tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
+        cachedTokenStream = true;
+      }
+      final MemoryIndex indexer = new MemoryIndex(true);
+      indexer.addField(DelegatingAtomicReader.FIELD_NAME, tokenStream);
+      tokenStream.reset();
+      final IndexSearcher searcher = indexer.createSearcher();
+      // MEM index has only atomic ctx
+      reader = new DelegatingAtomicReader(((AtomicReaderContext)searcher.getTopReaderContext()).reader());
+    }
+    return reader.getContext();
+  }
+
+  /*
+   * This reader will just delegate every call to a single field in the wrapped
+   * AtomicReader. This way we only need to build this field once rather than
+   * N-Times
+   */
+  static final class DelegatingAtomicReader extends FilterAtomicReader {
+    private static final String FIELD_NAME = "shadowed_field";
+
+    DelegatingAtomicReader(AtomicReader in) {
+      super(in);
+    }
+
+    @Override
+    public FieldInfos getFieldInfos() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Fields fields() throws IOException {
+      return new FilterFields(super.fields()) {
+        @Override
+        public Terms terms(String field) throws IOException {
+          return super.terms(DelegatingAtomicReader.FIELD_NAME);
+        }
+
+        @Override
+        public Iterator<String> iterator() {
+          return Collections.singletonList(DelegatingAtomicReader.FIELD_NAME).iterator();
+        }
+
+        @Override
+        public int size() {
+          return 1;
+        }
+      };
+    }
+
+    @Override
+    public DocValues docValues(String field) throws IOException {
+      return super.docValues(FIELD_NAME);
+    }
+
+    @Override
+    public DocValues normValues(String field) throws IOException {
+      return super.normValues(FIELD_NAME);
+    }
+  }

   /**
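The single shadowed field is what makes the "MemoryIndex only once" claim in the CHANGES entry work: whichever field name a rewritten query asks for, the reader answers from the one field that was actually built. Conceptually (the field names here are invented):

    // inside the extractor, after getLeafContext():
    AtomicReader r = getLeafContext().reader();   // a DelegatingAtomicReader
    Terms title = r.terms("title");               // both calls resolve to "shadowed_field",
    Terms body  = r.terms("body");                // so the TokenStream is indexed exactly once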
@@ -401,7 +456,7 @@ public class WeightedSpanTermExtractor {
     try {
       extract(query, terms);
     } finally {
-      closeReaders();
+      IOUtils.close(reader);
     }

     return terms;
@@ -449,8 +504,7 @@ public class WeightedSpanTermExtractor {
         weightedSpanTerm.weight *= idf;
       }
     } finally {
-
-      closeReaders();
+      IOUtils.close(reader);
     }

     return terms;
FieldQuery.java:
@@ -28,9 +28,12 @@ import java.util.Set;

 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.CommonTermsQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.FilteredQuery;
 import org.apache.lucene.search.MultiTermQuery;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
@@ -92,8 +95,7 @@ public class FieldQuery {
         if( !clause.isProhibited() )
           flatten( clause.getQuery(), reader, flatQueries );
       }
-    }
-    else if( sourceQuery instanceof DisjunctionMaxQuery ){
+    } else if( sourceQuery instanceof DisjunctionMaxQuery ){
       DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
       for( Query query : dmq ){
         flatten( query, reader, flatQueries );
@@ -103,12 +105,6 @@ public class FieldQuery {
       if( !flatQueries.contains( sourceQuery ) )
         flatQueries.add( sourceQuery );
     }
-    else if (sourceQuery instanceof MultiTermQuery && reader != null) {
-      MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone();
-      copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS));
-      BooleanQuery mtqTerms = (BooleanQuery) copy.rewrite(reader);
-      flatten(mtqTerms, reader, flatQueries);
-    }
     else if( sourceQuery instanceof PhraseQuery ){
       if( !flatQueries.contains( sourceQuery ) ){
         PhraseQuery pq = (PhraseQuery)sourceQuery;
@@ -118,6 +114,31 @@ public class FieldQuery {
           flatQueries.add( new TermQuery( pq.getTerms()[0] ) );
         }
       }
+    } else if (sourceQuery instanceof ConstantScoreQuery) {
+      final Query q = ((ConstantScoreQuery) sourceQuery).getQuery();
+      if (q != null) {
+        flatten(q, reader, flatQueries);
+      }
+    } else if (sourceQuery instanceof FilteredQuery) {
+      final Query q = ((FilteredQuery) sourceQuery).getQuery();
+      if (q != null) {
+        flatten(q, reader, flatQueries);
+      }
+    } else if (reader != null) {
+      Query query = sourceQuery;
+      if (sourceQuery instanceof MultiTermQuery) {
+        MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone();
+        copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS));
+        query = copy;
+      }
+      Query rewritten = query.rewrite(reader);
+      if (rewritten != query) {
+        // only rewrite once and then flatten again - the rewritten query could have a special treatment
+        // if this method is overwritten in a subclass.
+        flatten(rewritten, reader, flatQueries);
+      }
+      // if the query is already rewritten we discard it
+    }
+    // else discard queries
   }
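The effect of the new fallback branch in flatten, as a sketch (same-package access as in the tests below, since flatten is package-visible; the prefix and term values are ours): a MultiTermQuery is cloned, switched to a bounded top-terms rewrite, rewritten against the reader into a BooleanQuery over concrete terms, and that result is flattened recursively.

    PrefixQuery prefix = new PrefixQuery(new Term("field", "kenn"));
    Set<Query> flat = new HashSet<Query>();
    fq.flatten(prefix, reader, flat);
    // flat now holds TermQuerys for indexed terms matching "kenn*" (e.g. field:kennedy)
    // instead of the un-highlightable PrefixQuery being discarded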
HighlighterTest.java:
@@ -46,6 +46,7 @@ import org.apache.lucene.index.StoredDocument;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.CommonTermsQuery;
 import org.apache.lucene.search.*;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
@@ -114,6 +115,87 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
     }
   }

+  public void testHighlightingCommonTermsQuery() throws Exception {
+    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+    CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
+    query.add(new Term(FIELD_NAME, "this"));
+    query.add(new Term(FIELD_NAME, "long"));
+    query.add(new Term(FIELD_NAME, "very"));
+
+    searcher = new IndexSearcher(reader);
+    TopDocs hits = searcher.search(query, 10);
+    assertEquals(2, hits.totalHits);
+    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
+    Highlighter highlighter = new Highlighter(scorer);
+
+    StoredDocument doc = searcher.doc(hits.scoreDocs[0].doc);
+    String storedField = doc.get(FIELD_NAME);
+
+    TokenStream stream = TokenSources.getAnyTokenStream(searcher
+        .getIndexReader(), hits.scoreDocs[0].doc, FIELD_NAME, doc, analyzer);
+    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
+    highlighter.setTextFragmenter(fragmenter);
+    String fragment = highlighter.getBestFragment(stream, storedField);
+    assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
+
+    doc = searcher.doc(hits.scoreDocs[1].doc);
+    storedField = doc.get(FIELD_NAME);
+
+    stream = TokenSources.getAnyTokenStream(searcher
+        .getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
+    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
+    fragment = highlighter.getBestFragment(stream, storedField);
+    assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
+  }
+
+  public void testHighlightUnknowQueryAfterRewrite() throws IOException, InvalidTokenOffsetsException {
+    Query query = new Query() {
+
+      @Override
+      public Query rewrite(IndexReader reader) throws IOException {
+        CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
+        query.add(new Term(FIELD_NAME, "this"));
+        query.add(new Term(FIELD_NAME, "long"));
+        query.add(new Term(FIELD_NAME, "very"));
+        return query;
+      }
+
+      @Override
+      public String toString(String field) {
+        return null;
+      }
+    };
+
+    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+
+    searcher = new IndexSearcher(reader);
+    TopDocs hits = searcher.search(query, 10);
+    assertEquals(2, hits.totalHits);
+    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
+    Highlighter highlighter = new Highlighter(scorer);
+
+    StoredDocument doc = searcher.doc(hits.scoreDocs[0].doc);
+    String storedField = doc.get(FIELD_NAME);
+
+    TokenStream stream = TokenSources.getAnyTokenStream(searcher
+        .getIndexReader(), hits.scoreDocs[0].doc, FIELD_NAME, doc, analyzer);
+    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
+    highlighter.setTextFragmenter(fragmenter);
+    String fragment = highlighter.getBestFragment(stream, storedField);
+    assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
+
+    doc = searcher.doc(hits.scoreDocs[1].doc);
+    storedField = doc.get(FIELD_NAME);
+
+    stream = TokenSources.getAnyTokenStream(searcher
+        .getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
+    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
+    fragment = highlighter.getBestFragment(stream, storedField);
+    assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
+  }
+
   public void testHighlightingWithDefaultField() throws Exception {

     String s1 = "I call our world Flatland, not because we call it so,";
FastVectorHighlighterTest.java:
@@ -18,6 +18,8 @@ package org.apache.lucene.search.vectorhighlight;
 import java.io.IOException;

 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
@@ -26,7 +28,13 @@ import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.CommonTermsQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
 import org.apache.lucene.search.highlight.TokenSources;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
@@ -62,4 +70,47 @@ public class FastVectorHighlighterTest extends LuceneTestCase {
     writer.close();
     dir.close();
   }
+
+  public void testCommonTermsQueryHighlightTest() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
+    FieldType type = new FieldType(TextField.TYPE_STORED);
+    type.setStoreTermVectorOffsets(true);
+    type.setStoreTermVectorPositions(true);
+    type.setStoreTermVectors(true);
+    type.freeze();
+    String[] texts = {
+        "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
+        "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
+        "JFK has been shot", "John Kennedy has been shot",
+        "This text has a typo in referring to Keneddy",
+        "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" };
+    for (int i = 0; i < texts.length; i++) {
+      Document doc = new Document();
+      Field field = new Field("field", texts[i], type);
+      doc.add(field);
+      writer.addDocument(doc);
+    }
+    CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 2);
+    query.add(new Term("field", "text"));
+    query.add(new Term("field", "long"));
+    query.add(new Term("field", "very"));
+
+    FastVectorHighlighter highlighter = new FastVectorHighlighter();
+    IndexReader reader = DirectoryReader.open(writer, true);
+    IndexSearcher searcher = new IndexSearcher(reader);
+    TopDocs hits = searcher.search(query, 10);
+    assertEquals(2, hits.totalHits);
+    FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
+    String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[0].doc, "field", 1000, 1);
+    assertEquals("This piece of <b>text</b> refers to Kennedy at the beginning then has a longer piece of <b>text</b> that is <b>very</b> <b>long</b> in the middle and finally ends with another reference to Kennedy", bestFragments[0]);
+
+    fieldQuery = highlighter.getFieldQuery(query, reader);
+    bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[1].doc, "field", 1000, 1);
+    assertEquals("Hello this is a piece of <b>text</b> that is <b>very</b> <b>long</b> and contains too much preamble and the meat is really here which says kennedy has been shot", bestFragments[0]);
+
+    reader.close();
+    writer.close();
+    dir.close();
+  }
 }
FieldQueryTest.java:
@@ -23,8 +23,13 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;

+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.FilteredQuery;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
@@ -35,6 +40,7 @@ import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
 import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;

 public class FieldQueryTest extends AbstractTestCase {
@@ -905,4 +911,40 @@ public class FieldQueryTest extends AbstractTestCase {
     assertNotNull (fq.searchPhrase(F, phraseCandidate));
   }
+
+  public void testStopRewrite() throws Exception {
+    Query q = new Query() {
+
+      @Override
+      public String toString(String field) {
+        return "DummyQuery";
+      }
+    };
+    make1d1fIndex( "a" );
+    assertNotNull(reader);
+    new FieldQuery(q, reader, true, true );
+  }
+
+  public void testFlattenFilteredQuery() throws Exception {
+    Query query = new FilteredQuery(pqF( "A" ), new Filter() {
+      @Override
+      public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs)
+          throws IOException {
+        return null;
+      }
+    });
+    FieldQuery fq = new FieldQuery( query, true, true );
+    Set<Query> flatQueries = new HashSet<Query>();
+    fq.flatten( query, reader, flatQueries );
+    assertCollectionQueries( flatQueries, tq( "A" ) );
+  }
+
+  public void testFlattenConstantScoreQuery() throws Exception {
+    Query query = new ConstantScoreQuery(pqF( "A" ));
+    FieldQuery fq = new FieldQuery( query, true, true );
+    Set<Query> flatQueries = new HashSet<Query>();
+    fq.flatten( query, reader, flatQueries );
+    assertCollectionQueries( flatQueries, tq( "A" ) );
+  }
 }
MemoryIndex.java:
@@ -465,7 +465,9 @@ public class MemoryIndex {
       throw new RuntimeException(e);
     } finally {
       try {
-        if (stream != null) stream.close();
+        if (stream != null) {
+          stream.close();
+        }
       } catch (IOException e2) {
         throw new RuntimeException(e2);
       }