LUCENE-6034: Highlighter QueryScorer/WeightedSpanTermExtractor shouldn't re-invert a term vector based TokenStream. It can now highlight payload-sensitive queries.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1643316 13f79535-47bb-0310-9956-ffa450edef68
David Wayne Smiley 2014-12-05 15:08:57 +00:00
parent e460ce328c
commit 978aac3184
6 changed files with 383 additions and 96 deletions

View File

@ -119,6 +119,10 @@ New Features
* LUCENE-6088: TermsFilter implements Accountable. (Adrien Grand)
* LUCENE-6034: The default highlighter when used with QueryScorer will highlight payload-sensitive
queries provided that term vectors with positions, offsets, and payloads are present. This is the
only highlighter that can highlight such queries accurately. (David Smiley)
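(For illustration only, not part of CHANGES.txt: a minimal sketch of the usage this entry describes, modeled on the new testPayloadQuery test in this commit. The field name, reader, docId, analyzer and storedText are placeholders, and the analyzer is assumed to write "pos: N" payloads the way MockPayloadAnalyzer does in the tests.)
// Match "words" only where its payload equals "pos: 1"
Query query = new SpanPayloadCheckQuery(
    new SpanTermQuery(new Term("body", "words")),
    Collections.singleton("pos: 1".getBytes(StandardCharsets.UTF_8)));
QueryScorer scorer = new QueryScorer(query, reader, "body"); // the reader gives the scorer access to payloads
Highlighter highlighter = new Highlighter(scorer);
// Term vectors with positions, offsets and payloads back this stream; nothing is re-analyzed
TokenStream stream = TokenSources.getAnyTokenStream(reader, docId, "body", analyzer);
String fragment = highlighter.getBestFragment(stream, storedText); // only the payload-matching "words" gets wrapped in <B>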
Optimizations
* LUCENE-5960: Use a more efficient bitset, not a Set<Integer>, to
@ -161,6 +165,10 @@ Optimizations
* LUCENE-6089, LUCENE-6090: Tune CompressionMode.HIGH_COMPRESSION for
better compression and less cpu usage. (Adrien Grand, Robert Muir)
* LUCENE-6034: QueryScorer, used by the default highlighter, needn't re-index the provided
TokenStream with MemoryIndex when it comes from TokenSources (term vectors) with offsets and
positions. (David Smiley)
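(Also for illustration, not part of CHANGES.txt: the indexing-side setup that enables this fast path; the field name and variables are placeholders.)
// Term vectors with positions and offsets let the extractor wrap the vector in a
// TermVectorLeafReader instead of re-inverting the TokenStream into a MemoryIndex.
FieldType tvType = new FieldType(TextField.TYPE_STORED);
tvType.setStoreTermVectors(true);
tvType.setStoreTermVectorPositions(true);
tvType.setStoreTermVectorOffsets(true);
tvType.setStoreTermVectorPayloads(true); // only needed for payload-sensitive queries
tvType.freeze();
doc.add(new Field("body", text, tvType));
// At highlight time, TokenSources then hands back a TokenStreamFromTermVector for this field.
TokenStream stream = TokenSources.getAnyTokenStream(reader, docId, "body", analyzer);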
API Changes
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and

View File

@ -265,7 +265,8 @@ public class QueryScorer implements Scorer {
* {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
* ensure an efficient reset - if you are already using a different caching
* {@link TokenStream} impl and you don't want it to be wrapped, set this to
* false.
* false. Note that term-vector based tokenstreams are detected and won't be
* wrapped either.
*/
public void setWrapIfNotCachingTokenFilter(boolean wrap) {
this.wrapToCaching = wrap;
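(Illustrative sketch of this setter, not from the patch; the query and field name are placeholders.)
QueryScorer scorer = new QueryScorer(query, "body");
scorer.setWrapIfNotCachingTokenFilter(false); // caller already supplies its own caching TokenStream impl
// Term-vector based streams are detected and left unwrapped regardless of this setting.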

View File

@ -0,0 +1,176 @@
package org.apache.lucene.search.highlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.Bits;
/**
* Wraps a Terms with a {@link org.apache.lucene.index.LeafReader}, typically from term vectors.
*
* @lucene.experimental
*/
public class TermVectorLeafReader extends LeafReader {
private final Fields fields;
private final FieldInfos fieldInfos;
public TermVectorLeafReader(String field, Terms terms) {
fields = new Fields() {
@Override
public Iterator<String> iterator() {
return Collections.singletonList(field).iterator();
}
@Override
public Terms terms(String fld) throws IOException {
if (!field.equals(fld)) {
return null;
}
return terms;
}
@Override
public int size() {
return 1;
}
};
IndexOptions indexOptions;
if (!terms.hasFreqs()) {
indexOptions = IndexOptions.DOCS;
} else if (!terms.hasPositions()) {
indexOptions = IndexOptions.DOCS_AND_FREQS;
} else if (!terms.hasOffsets()) {
indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
} else {
indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
}
FieldInfo fieldInfo = new FieldInfo(field, 0,
true, true, terms.hasPayloads(),
indexOptions, DocValuesType.NONE, -1, null);
fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo});
}
@Override
public void addCoreClosedListener(CoreClosedListener listener) {
addCoreClosedListenerAsReaderClosedListener(this, listener);
}
@Override
public void removeCoreClosedListener(CoreClosedListener listener) {
removeCoreClosedListenerAsReaderClosedListener(this, listener);
}
@Override
protected void doClose() throws IOException {
}
@Override
public Fields fields() throws IOException {
return fields;
}
@Override
public NumericDocValues getNumericDocValues(String field) throws IOException {
return null;
}
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
return null;
}
@Override
public SortedDocValues getSortedDocValues(String field) throws IOException {
return null;
}
@Override
public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
return null;
}
@Override
public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
return null;
}
@Override
public Bits getDocsWithField(String field) throws IOException {
return null;
}
@Override
public NumericDocValues getNormValues(String field) throws IOException {
return null;//Is this needed? See MemoryIndex for a way to do it.
}
@Override
public FieldInfos getFieldInfos() {
return fieldInfos;
}
@Override
public Bits getLiveDocs() {
return null;
}
@Override
public void checkIntegrity() throws IOException {
}
@Override
public Fields getTermVectors(int docID) throws IOException {
if (docID != 0) {
return null;
}
return fields();
}
@Override
public int numDocs() {
return 1;
}
@Override
public int maxDoc() {
return 1;
}
@Override
public void document(int docID, StoredFieldVisitor visitor) throws IOException {
}
}
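(Usage sketch, not part of the new file: wrapping one document's term vector; reader, docId and the field name are placeholders.)
Terms tvTerms = reader.getTermVector(docId, "body"); // requires stored term vectors for the field
if (tvTerms != null && tvTerms.hasPositions() && tvTerms.hasOffsets()) {
  LeafReader leaf = new TermVectorLeafReader("body", tvTerms);
  // Behaves as a one-document, one-field index over that vector:
  assert leaf.maxDoc() == 1;
  assert leaf.fields().terms("body") != null;
}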

View File

@ -36,7 +36,7 @@ import org.apache.lucene.index.Terms;
*/
public class TokenSources {
/**
* A convenience method that tries to first get a TermPositionVector for the
* A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
* specified docId, then falls back to using the passed-in
* {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
* This is useful when you already have the document, but would prefer to use

View File

@ -16,6 +16,7 @@ package org.apache.lucene.search.highlight;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
@ -29,13 +30,13 @@ import java.util.TreeSet;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
@ -43,7 +44,18 @@ import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.join.ToChildBlockJoinQuery;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
@ -65,7 +77,7 @@ import org.apache.lucene.util.IOUtils;
public class WeightedSpanTermExtractor {
private String fieldName;
private TokenStream tokenStream;
private TokenStream tokenStream;//set as a side effect of the getWeightedSpanTerms* methods
private String defaultField;
private boolean expandMultiTermQuery;
private boolean cachedTokenStream;
@ -209,6 +221,8 @@ public class WeightedSpanTermExtractor {
sp.setBoost(query.getBoost());
extractWeightedSpanTerms(terms, sp);
}
} else if (query instanceof MatchAllDocsQuery) {
//nothing
} else {
Query origQuery = query;
if (query instanceof MultiTermQuery) {
@ -357,18 +371,39 @@ public class WeightedSpanTermExtractor {
protected LeafReaderContext getLeafContext() throws IOException {
if (internalReader == null) {
if(wrapToCaching && !(tokenStream instanceof CachingTokenFilter)) {
assert !cachedTokenStream;
tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
cachedTokenStream = true;
boolean cacheIt = wrapToCaching && !(tokenStream instanceof CachingTokenFilter);
// If it's from term vectors, simply wrap the underlying Terms in a reader
if (tokenStream instanceof TokenStreamFromTermVector) {
cacheIt = false;
Terms termVectorTerms = ((TokenStreamFromTermVector) tokenStream).getTermVectorTerms();
if (termVectorTerms.hasPositions() && termVectorTerms.hasOffsets()) {
internalReader = new TermVectorLeafReader(DelegatingLeafReader.FIELD_NAME, termVectorTerms);
}
}
final MemoryIndex indexer = new MemoryIndex(true);
indexer.addField(DelegatingLeafReader.FIELD_NAME, tokenStream);
tokenStream.reset();
final IndexSearcher searcher = indexer.createSearcher();
// MEM index has only atomic ctx
internalReader = new DelegatingLeafReader(((LeafReaderContext)searcher.getTopReaderContext()).reader());
// Use MemoryIndex (index/invert this tokenStream now)
if (internalReader == null) {
final MemoryIndex indexer = new MemoryIndex(true);
if (cacheIt) {
assert !cachedTokenStream;
tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
cachedTokenStream = true;
indexer.addField(DelegatingLeafReader.FIELD_NAME, tokenStream);
} else {
indexer.addField(DelegatingLeafReader.FIELD_NAME,
new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
}
tokenStream.reset();//reset to beginning when we return
final IndexSearcher searcher = indexer.createSearcher();
// MEM index has only atomic ctx
internalReader = ((LeafReaderContext) searcher.getTopReaderContext()).reader();
}
//Now wrap it so we always use a common field.
this.internalReader = new DelegatingLeafReader(internalReader);
}
return internalReader.getContext();
}
@ -532,7 +567,7 @@ public class WeightedSpanTermExtractor {
return terms;
}
protected void collectSpanQueryFields(SpanQuery spanQuery, Set<String> fieldNames) {
if (spanQuery instanceof FieldMaskingSpanQuery) {
collectSpanQueryFields(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery(), fieldNames);
@ -622,8 +657,11 @@ public class WeightedSpanTermExtractor {
public boolean isCachedTokenStream() {
return cachedTokenStream;
}
/** Returns the tokenStream, which may have been wrapped in a CachingTokenFilter.
* The getWeightedSpanTerms* methods set the tokenStream, so don't call this before one of them. */
public TokenStream getTokenStream() {
assert tokenStream != null;
return tokenStream;
}
@ -632,12 +670,16 @@ public class WeightedSpanTermExtractor {
* {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
* ensure an efficient reset - if you are already using a different caching
* {@link TokenStream} impl and you don't want it to be wrapped, set this to
* false.
* false. This setting is ignored when a term vector based TokenStream is supplied,
* since it can be reset efficiently.
*/
public void setWrapIfNotCachingTokenFilter(boolean wrap) {
this.wrapToCaching = wrap;
}
/** A threshold of the number of characters to analyze. When a TokenStream based on
* term vectors with offsets and positions is supplied, this setting
* does not apply. */
protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
}

View File

@ -22,6 +22,7 @@ import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
@ -36,6 +37,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
@ -78,8 +80,20 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
Directory ramDir;
public IndexSearcher searcher = null;
int numHighlights = 0;
Analyzer analyzer;
MockAnalyzer analyzer;
TopDocs hits;
FieldType fieldType;//see doc()
final FieldType FIELD_TYPE_TV;
{
FieldType fieldType = new FieldType(TextField.TYPE_STORED);
fieldType.setStoreTermVectors(true);
fieldType.setStoreTermVectorPositions(true);
fieldType.setStoreTermVectorPayloads(true);
fieldType.setStoreTermVectorOffsets(true);
fieldType.freeze();
FIELD_TYPE_TV = fieldType;
}
String[] texts = {
"Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
@ -121,9 +135,8 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
}
public void testHighlightingCommonTermsQuery() throws Exception {
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
query.add(new Term(FIELD_NAME, "this"));
query.add(new Term(FIELD_NAME, "this"));//stop-word
query.add(new Term(FIELD_NAME, "long"));
query.add(new Term(FIELD_NAME, "very"));
@ -141,7 +154,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
highlighter.setTextFragmenter(fragmenter);
String fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
doc = searcher.doc(hits.scoreDocs[1].doc);
storedField = doc.get(FIELD_NAME);
@ -150,7 +163,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
.getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
}
public void testHighlightUnknowQueryAfterRewrite() throws IOException, InvalidTokenOffsetsException {
@ -159,7 +172,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
@Override
public Query rewrite(IndexReader reader) throws IOException {
CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
query.add(new Term(FIELD_NAME, "this"));
query.add(new Term(FIELD_NAME, "this"));//stop-word
query.add(new Term(FIELD_NAME, "long"));
query.add(new Term(FIELD_NAME, "very"));
return query;
@ -180,9 +193,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
return super.equals(obj);
}
};
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
searcher = newSearcher(reader);
TopDocs hits = searcher.search(query, 10);
assertEquals(2, hits.totalHits);
@ -197,7 +208,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
highlighter.setTextFragmenter(fragmenter);
String fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
doc = searcher.doc(hits.scoreDocs[1].doc);
storedField = doc.get(FIELD_NAME);
@ -206,7 +217,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
.getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
}
@ -252,8 +263,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
*/
private String highlightField(Query query, String fieldName, String text)
throws IOException, InvalidTokenOffsetsException {
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)
.tokenStream(fieldName, text);
TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
QueryScorer scorer = new QueryScorer(query, fieldName, FIELD_NAME);
@ -351,8 +361,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@ -380,8 +391,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@ -409,8 +421,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@ -434,8 +447,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@ -458,8 +472,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@ -482,8 +497,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@ -567,7 +583,8 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
for (int i = 0; i < hits.totalHits; i++) {
String text = "parent document";
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
@ -592,8 +609,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
highlighter.setTextFragmenter(new SimpleFragmenter(40));
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
"...");
@ -614,8 +632,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
int maxNumFragmentsRequired = 2;
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(this, scorer);
@ -644,8 +663,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 5));
@ -698,8 +718,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
Highlighter highlighter = new Highlighter(this,scorer);
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@ -769,8 +790,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
highlighter.setTextFragmenter(new SimpleFragmenter(40));
int maxNumFragmentsRequired = 2;
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
"...");
@ -963,11 +985,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
hits = searcher.search(query, null, 1000);
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
int maxNumFragmentsRequired = 2;
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
QueryScorer scorer = new QueryScorer(query, HighlighterTest.FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);
Highlighter highlighter = new Highlighter(this, scorer);
@ -987,11 +1009,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
QueryScorer scorer = new QueryScorer(query, null);
TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);
Highlighter highlighter = new Highlighter(this, scorer);
@ -1011,11 +1034,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
int maxNumFragmentsRequired = 2;
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
QueryScorer scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);
Highlighter highlighter = new Highlighter(this, scorer);
@ -1185,8 +1208,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);
@ -1199,21 +1223,25 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);
highlighter.getBestFragment(analyzer, FIELD_NAME, text);
highlighter.getBestFragment(tokenStream, text);
}
assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
numHighlights == 4);
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);
highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10);
highlighter.getBestFragments(tokenStream, text, 10);
}
assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
numHighlights == 4);
@ -1339,8 +1367,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);// new Highlighter(this, new
@ -1368,9 +1397,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
}
public void testMaxSizeHighlight() throws Exception {
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
// we disable MockTokenizer checks because we will forcefully limit the
// we disable MockTokenizer checks because we will forcefully limit the
// tokenstream and call end() before incrementToken() returns false.
// But we first need to clear the re-used tokenstream components that have enableChecks.
analyzer.getReuseStrategy().setReusableComponents(analyzer, FIELD_NAME, null);
analyzer.setEnableChecks(false);
TestHighlightRunner helper = new TestHighlightRunner() {
@ -1471,8 +1501,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
numHighlights = 0;
// test to show how rewritten query can still be used
searcher = newSearcher(reader);
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
BooleanQuery query = new BooleanQuery();
query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD);
query.add(new WildcardQuery(new Term(FIELD_NAME, "kenned*")), Occur.SHOULD);
@ -1491,8 +1520,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
int maxNumFragmentsRequired = 3;
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(FIELD_NAME);
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
Highlighter highlighter = getHighlighter(query, FIELD_NAME, HighlighterTest.this, false);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@ -1823,12 +1853,6 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
searchIndex();
}
private Document doc( String f, String v ){
Document doc = new Document();
doc.add( new TextField( f, v, Field.Store.YES));
return doc;
}
private void makeIndex() throws IOException {
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
writer.addDocument( doc( "t_text1", "random words for highlighting tests del" ) );
@ -1867,6 +1891,34 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
reader.close();
}
/** If we have term vectors, we can highlight based on payloads */
public void testPayloadQuery() throws IOException, InvalidTokenOffsetsException {
final String text = "random words and words";//"words" at positions 1 & 4
Analyzer analyzer = new MockPayloadAnalyzer();//sets payload to "pos: X" (where X is position #)
try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
writer.deleteAll();
Document doc = new Document();
doc.add(new Field(FIELD_NAME, text, FIELD_TYPE_TV));
writer.addDocument(doc);
writer.commit();
}
try (IndexReader reader = DirectoryReader.open(dir)) {
Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "words")),
Collections.singleton("pos: 1".getBytes("UTF-8")));//just match the first "word" occurrence
IndexSearcher searcher = newSearcher(reader);
Scorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME);
Highlighter h = new Highlighter(scorer);
TopDocs hits = searcher.search(query, null, 10);
assertEquals(1, hits.scoreDocs.length);
TokenStream stream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), 0, FIELD_NAME, analyzer);
String result = h.getBestFragment(stream, text);
assertEquals("random <B>words</B> and words", result);//only highlight first "word"
}
}
/*
*
* public void testBigramAnalyzer() throws IOException, ParseException {
@ -1934,14 +1986,21 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
public void setUp() throws Exception {
super.setUp();
//Not many use this setup:
a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
dir = newDirectory();
//Most tests use this setup:
analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
ramDir = newDirectory();
IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
fieldType = random().nextBoolean() ? FIELD_TYPE_TV : TextField.TYPE_STORED;
IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(analyzer));
for (String text : texts) {
addDoc(writer, text);
writer.addDocument(doc(FIELD_NAME, text));
}
// a few tests need other docs...:
Document doc = new Document();
doc.add(new IntField(NUMERIC_FIELD_NAME, 1, Field.Store.NO));
doc.add(new StoredField(NUMERIC_FIELD_NAME, 1));
@ -1969,6 +2028,8 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
writer.forceMerge(1);
writer.close();
reader = DirectoryReader.open(ramDir);
//Misc:
numHighlights = 0;
}
@ -1979,13 +2040,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
ramDir.close();
super.tearDown();
}
private void addDoc(IndexWriter writer, String text) throws IOException {
private Document doc(String name, String value) {
Document d = new Document();
Field f = new TextField(FIELD_NAME, text, Field.Store.YES);
d.add(f);
writer.addDocument(d);
d.add(new Field(name, value, fieldType));//fieldType is randomly chosen for term vectors in setUp
return d;
}
private static Token createToken(String term, int start, int offset)
@ -2164,11 +2223,13 @@ final class SynonymTokenizer extends TokenStream {
throws Exception {
for (int i = 0; i < hits.totalHits; i++) {
String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
String text = doc.get(HighlighterTest.FIELD_NAME);
int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
Scorer scorer = null;
TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);
TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(),
hits.scoreDocs[i].doc, HighlighterTest.FIELD_NAME, doc, analyzer);
if (mode == QUERY) {
scorer = new QueryScorer(query);
} else if (mode == QUERY_TERM) {
@ -2176,7 +2237,6 @@ final class SynonymTokenizer extends TokenStream {
}
Highlighter highlighter = new Highlighter(formatter, scorer);
highlighter.setTextFragmenter(frag);
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
fragmentSeparator);
if (LuceneTestCase.VERBOSE) System.out.println("\t" + result);