mirror of https://github.com/apache/lucene.git
LUCENE-6034: Highlighter QueryScorer/WeightedSpanTermExtractor shouldn't re-invert a term vector based TokenStream. It can now highlight payload-sensitive queries.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1643316 13f79535-47bb-0310-9956-ffa450edef68
parent e460ce328c
commit 978aac3184
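For context, a minimal sketch (not part of this commit) of the end-to-end usage the change enables, modeled on the testPayloadQuery test added below. The field name "body", the class name, and the assumption that the analyzer attaches payloads such as "pos: 1" to tokens are illustrative only:

import java.nio.charset.StandardCharsets;
import java.util.Collections;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.payloads.SpanPayloadCheckQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

class PayloadHighlightSketch {

  /** Highlights only the "words" occurrence whose payload is "pos: 1". */
  static String highlightFirstWords(IndexReader reader, Analyzer analyzer, String storedText)
      throws Exception {
    // Payload-sensitive query: only spans carrying the payload "pos: 1" match.
    Query query = new SpanPayloadCheckQuery(
        new SpanTermQuery(new Term("body", "words")),
        Collections.singleton("pos: 1".getBytes(StandardCharsets.UTF_8)));

    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs hits = searcher.search(query, 10);

    // QueryScorer now sees payloads, so the highlight honors the payload check.
    Highlighter highlighter = new Highlighter(new QueryScorer(query, reader, "body"));

    // With term vectors (positions + offsets) stored on "body", this TokenStream is
    // term-vector based and is no longer re-inverted into a MemoryIndex by QueryScorer.
    TokenStream stream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[0].doc, "body", analyzer);
    return highlighter.getBestFragment(stream, storedText);
  }
}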
@@ -119,6 +119,10 @@ New Features

* LUCENE-6088: TermsFilter implements Accountable. (Adrien Grand)

* LUCENE-6034: The default highlighter when used with QueryScorer will highlight payload-sensitive
  queries provided that term vectors with positions, offsets, and payloads are present. This is the
  only highlighter that can highlight such queries accurately. (David Smiley)

Optimizations

* LUCENE-5960: Use a more efficient bitset, not a Set<Integer>, to
@@ -161,6 +165,10 @@ Optimizations
* LUCENE-6089, LUCENE-6090: Tune CompressionMode.HIGH_COMPRESSION for
  better compression and less cpu usage. (Adrien Grand, Robert Muir)

* LUCENE-6034: QueryScorer, used by the default highlighter, needn't re-index the provided
  TokenStream with MemoryIndex when it comes from TokenSources (term vectors) with offsets and
  positions. (David Smiley)

API Changes

* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
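The optimization above only applies when the highlighted field stores term vectors. A hypothetical field setup, not taken from this diff (the field name "body" is an assumption), under which TokenSources hands QueryScorer a term-vector TokenStream:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;

class TermVectorFieldSketch {
  // Positions and offsets are what lets QueryScorer skip the MemoryIndex re-index;
  // payloads are additionally needed for payload-sensitive queries.
  static Document docWithTermVectors(String text) {
    FieldType tvType = new FieldType(TextField.TYPE_STORED);
    tvType.setStoreTermVectors(true);
    tvType.setStoreTermVectorPositions(true);
    tvType.setStoreTermVectorOffsets(true);
    tvType.setStoreTermVectorPayloads(true);
    tvType.freeze();

    Document doc = new Document();
    doc.add(new Field("body", text, tvType));
    return doc;
  }
}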
@@ -265,7 +265,8 @@ public class QueryScorer implements Scorer {
   * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
   * ensure an efficient reset - if you are already using a different caching
   * {@link TokenStream} impl and you don't want it to be wrapped, set this to
   * false.
   * false. Note that term-vector based tokenstreams are detected and won't be
   * wrapped either.
   */
  public void setWrapIfNotCachingTokenFilter(boolean wrap) {
    this.wrapToCaching = wrap;
@@ -0,0 +1,176 @@
package org.apache.lucene.search.highlight;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.Bits;

/**
 * Wraps a Terms with a {@link org.apache.lucene.index.LeafReader}, typically from term vectors.
 *
 * @lucene.experimental
 */
public class TermVectorLeafReader extends LeafReader {

  private final Fields fields;
  private final FieldInfos fieldInfos;

  public TermVectorLeafReader(String field, Terms terms) {
    fields = new Fields() {
      @Override
      public Iterator<String> iterator() {
        return Collections.singletonList(field).iterator();
      }

      @Override
      public Terms terms(String fld) throws IOException {
        if (!field.equals(fld)) {
          return null;
        }
        return terms;
      }

      @Override
      public int size() {
        return 1;
      }
    };

    IndexOptions indexOptions;
    if (!terms.hasFreqs()) {
      indexOptions = IndexOptions.DOCS;
    } else if (!terms.hasPositions()) {
      indexOptions = IndexOptions.DOCS_AND_FREQS;
    } else if (!terms.hasOffsets()) {
      indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    } else {
      indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    }
    FieldInfo fieldInfo = new FieldInfo(field, 0,
        true, true, terms.hasPayloads(),
        indexOptions, DocValuesType.NONE, -1, null);
    fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo});
  }

  @Override
  public void addCoreClosedListener(CoreClosedListener listener) {
    addCoreClosedListenerAsReaderClosedListener(this, listener);
  }

  @Override
  public void removeCoreClosedListener(CoreClosedListener listener) {
    removeCoreClosedListenerAsReaderClosedListener(this, listener);
  }

  @Override
  protected void doClose() throws IOException {
  }

  @Override
  public Fields fields() throws IOException {
    return fields;
  }

  @Override
  public NumericDocValues getNumericDocValues(String field) throws IOException {
    return null;
  }

  @Override
  public BinaryDocValues getBinaryDocValues(String field) throws IOException {
    return null;
  }

  @Override
  public SortedDocValues getSortedDocValues(String field) throws IOException {
    return null;
  }

  @Override
  public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
    return null;
  }

  @Override
  public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
    return null;
  }

  @Override
  public Bits getDocsWithField(String field) throws IOException {
    return null;
  }

  @Override
  public NumericDocValues getNormValues(String field) throws IOException {
    return null;//Is this needed? See MemoryIndex for a way to do it.
  }

  @Override
  public FieldInfos getFieldInfos() {
    return fieldInfos;
  }

  @Override
  public Bits getLiveDocs() {
    return null;
  }

  @Override
  public void checkIntegrity() throws IOException {
  }

  @Override
  public Fields getTermVectors(int docID) throws IOException {
    if (docID != 0) {
      return null;
    }
    return fields();
  }

  @Override
  public int numDocs() {
    return 1;
  }

  @Override
  public int maxDoc() {
    return 1;
  }

  @Override
  public void document(int docID, StoredFieldVisitor visitor) throws IOException {
  }

}
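A hypothetical use of the new class, not shown in this diff: wrapping a single document's term vector as a one-document LeafReader (the field name "body" and the helper class are assumptions):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.highlight.TermVectorLeafReader;

class TermVectorLeafReaderSketch {
  // Returns a reader whose doc 0 exposes the given document's term vector for "body".
  // Requires that "body" was indexed with term vectors; getTermVector returns null otherwise.
  static LeafReader wrap(IndexReader reader, int docId) throws IOException {
    Terms tvTerms = reader.getTermVector(docId, "body");
    return new TermVectorLeafReader("body", tvTerms);
  }
}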
@@ -36,7 +36,7 @@ import org.apache.lucene.index.Terms;
 */
public class TokenSources {
  /**
   * A convenience method that tries to first get a TermPositionVector for the
   * A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
   * specified docId, then, falls back to using the passed in
   * {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
   * This is useful when you already have the document, but would prefer to use
@@ -16,6 +16,7 @@ package org.apache.lucene.search.highlight;
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
@@ -29,13 +30,13 @@ import java.util.TreeSet;

import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
@@ -43,7 +44,18 @@ import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.join.ToChildBlockJoinQuery;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
@@ -65,7 +77,7 @@ import org.apache.lucene.util.IOUtils;
public class WeightedSpanTermExtractor {

  private String fieldName;
  private TokenStream tokenStream;
  private TokenStream tokenStream;//set subsequent to getWeightedSpanTerms* methods
  private String defaultField;
  private boolean expandMultiTermQuery;
  private boolean cachedTokenStream;
@@ -209,6 +221,8 @@ public class WeightedSpanTermExtractor {
        sp.setBoost(query.getBoost());
        extractWeightedSpanTerms(terms, sp);
      }
    } else if (query instanceof MatchAllDocsQuery) {
      //nothing
    } else {
      Query origQuery = query;
      if (query instanceof MultiTermQuery) {
@@ -357,18 +371,39 @@ public class WeightedSpanTermExtractor {

  protected LeafReaderContext getLeafContext() throws IOException {
    if (internalReader == null) {
      if(wrapToCaching && !(tokenStream instanceof CachingTokenFilter)) {
        assert !cachedTokenStream;
        tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
        cachedTokenStream = true;
      boolean cacheIt = wrapToCaching && !(tokenStream instanceof CachingTokenFilter);

      // If it's from term vectors, simply wrap the underlying Terms in a reader
      if (tokenStream instanceof TokenStreamFromTermVector) {
        cacheIt = false;
        Terms termVectorTerms = ((TokenStreamFromTermVector) tokenStream).getTermVectorTerms();
        if (termVectorTerms.hasPositions() && termVectorTerms.hasOffsets()) {
          internalReader = new TermVectorLeafReader(DelegatingLeafReader.FIELD_NAME, termVectorTerms);
        }
      }
      final MemoryIndex indexer = new MemoryIndex(true);
      indexer.addField(DelegatingLeafReader.FIELD_NAME, tokenStream);
      tokenStream.reset();
      final IndexSearcher searcher = indexer.createSearcher();
      // MEM index has only atomic ctx
      internalReader = new DelegatingLeafReader(((LeafReaderContext)searcher.getTopReaderContext()).reader());

      // Use MemoryIndex (index/invert this tokenStream now)
      if (internalReader == null) {
        final MemoryIndex indexer = new MemoryIndex(true);
        if (cacheIt) {
          assert !cachedTokenStream;
          tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
          cachedTokenStream = true;
          indexer.addField(DelegatingLeafReader.FIELD_NAME, tokenStream);
        } else {
          indexer.addField(DelegatingLeafReader.FIELD_NAME,
              new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
        }
        tokenStream.reset();//reset to beginning when we return
        final IndexSearcher searcher = indexer.createSearcher();
        // MEM index has only atomic ctx
        internalReader = ((LeafReaderContext) searcher.getTopReaderContext()).reader();
      }

      //Now wrap it so we always use a common field.
      this.internalReader = new DelegatingLeafReader(internalReader);
    }

    return internalReader.getContext();
  }
@@ -532,7 +567,7 @@ public class WeightedSpanTermExtractor {

    return terms;
  }

  protected void collectSpanQueryFields(SpanQuery spanQuery, Set<String> fieldNames) {
    if (spanQuery instanceof FieldMaskingSpanQuery) {
      collectSpanQueryFields(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery(), fieldNames);
@@ -622,8 +657,11 @@ public class WeightedSpanTermExtractor {
  public boolean isCachedTokenStream() {
    return cachedTokenStream;
  }

  /** Returns the tokenStream which may have been wrapped in a CachingTokenFilter.
   * getWeightedSpanTerms* sets the tokenStream, so don't call this before. */
  public TokenStream getTokenStream() {
    assert tokenStream != null;
    return tokenStream;
  }
@@ -632,12 +670,16 @@ public class WeightedSpanTermExtractor {
   * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
   * ensure an efficient reset - if you are already using a different caching
   * {@link TokenStream} impl and you don't want it to be wrapped, set this to
   * false.
   * false. This setting is ignored when a term vector based TokenStream is supplied,
   * since it can be reset efficiently.
   */
  public void setWrapIfNotCachingTokenFilter(boolean wrap) {
    this.wrapToCaching = wrap;
  }

  /** A threshold of number of characters to analyze. When a TokenStream based on
   * term vectors with offsets and positions are supplied, this setting
   * does not apply. */
  protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
    this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
  }
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
@@ -36,6 +37,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
@@ -78,8 +80,20 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
  Directory ramDir;
  public IndexSearcher searcher = null;
  int numHighlights = 0;
  Analyzer analyzer;
  MockAnalyzer analyzer;
  TopDocs hits;
  FieldType fieldType;//see doc()

  final FieldType FIELD_TYPE_TV;
  {
    FieldType fieldType = new FieldType(TextField.TYPE_STORED);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);
    fieldType.setStoreTermVectorPayloads(true);
    fieldType.setStoreTermVectorOffsets(true);
    fieldType.freeze();
    FIELD_TYPE_TV = fieldType;
  }

  String[] texts = {
      "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
@@ -121,9 +135,8 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
  }

  public void testHighlightingCommonTermsQuery() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
    CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
    query.add(new Term(FIELD_NAME, "this"));
    query.add(new Term(FIELD_NAME, "this"));//stop-word
    query.add(new Term(FIELD_NAME, "long"));
    query.add(new Term(FIELD_NAME, "very"));
@@ -141,7 +154,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
    highlighter.setTextFragmenter(fragmenter);
    String fragment = highlighter.getBestFragment(stream, storedField);
    assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
    assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);

    doc = searcher.doc(hits.scoreDocs[1].doc);
    storedField = doc.get(FIELD_NAME);
@@ -150,7 +163,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
        .getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
    fragment = highlighter.getBestFragment(stream, storedField);
    assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
    assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
  }

  public void testHighlightUnknowQueryAfterRewrite() throws IOException, InvalidTokenOffsetsException {
@@ -159,7 +172,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
      @Override
      public Query rewrite(IndexReader reader) throws IOException {
        CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
        query.add(new Term(FIELD_NAME, "this"));
        query.add(new Term(FIELD_NAME, "this"));//stop-word
        query.add(new Term(FIELD_NAME, "long"));
        query.add(new Term(FIELD_NAME, "very"));
        return query;
@@ -180,9 +193,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
        return super.equals(obj);
      }
    };

    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);

    searcher = newSearcher(reader);
    TopDocs hits = searcher.search(query, 10);
    assertEquals(2, hits.totalHits);
@@ -197,7 +208,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
    highlighter.setTextFragmenter(fragmenter);
    String fragment = highlighter.getBestFragment(stream, storedField);
    assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
    assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);

    doc = searcher.doc(hits.scoreDocs[1].doc);
    storedField = doc.get(FIELD_NAME);
@@ -206,7 +217,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
        .getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
    fragment = highlighter.getBestFragment(stream, storedField);
    assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
    assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);

  }
|
@ -252,8 +263,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
|||
*/
|
||||
private String highlightField(Query query, String fieldName, String text)
|
||||
throws IOException, InvalidTokenOffsetsException {
|
||||
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)
|
||||
.tokenStream(fieldName, text);
|
||||
TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
|
||||
// Assuming "<B>", "</B>" used to highlight
|
||||
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
|
||||
QueryScorer scorer = new QueryScorer(query, fieldName, FIELD_NAME);
|
||||
|
@@ -351,8 +361,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    Highlighter highlighter = new Highlighter(this, scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -380,8 +391,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    highlighter = new Highlighter(this, scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -409,8 +421,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    highlighter = new Highlighter(this, scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -434,8 +447,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    Highlighter highlighter = new Highlighter(this, scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -458,8 +472,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    Highlighter highlighter = new Highlighter(this, scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -482,8 +497,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    Highlighter highlighter = new Highlighter(this, scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -567,7 +583,8 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter

    for (int i = 0; i < hits.totalHits; i++) {
      String text = "parent document";
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      highlighter.setTextFragmenter(new SimpleFragmenter(40));
      highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
@@ -592,8 +609,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    highlighter.setTextFragmenter(new SimpleFragmenter(40));

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          "...");
@@ -614,8 +632,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    int maxNumFragmentsRequired = 2;

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
      QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
      Highlighter highlighter = new Highlighter(this, scorer);
@@ -644,8 +663,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    Highlighter highlighter = new Highlighter(this, scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 5));
@@ -698,8 +718,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    Highlighter highlighter = new Highlighter(this,scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -769,8 +790,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    highlighter.setTextFragmenter(new SimpleFragmenter(40));
    int maxNumFragmentsRequired = 2;
    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          "...");
@@ -963,11 +985,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    hits = searcher.search(query, null, 1000);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
      int maxNumFragmentsRequired = 2;
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
      int maxNumFragmentsRequired = 2;
      String fragmentSeparator = "...";
      QueryScorer scorer = new QueryScorer(query, HighlighterTest.FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);

      Highlighter highlighter = new Highlighter(this, scorer);
@@ -987,11 +1009,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    numHighlights = 0;

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
      int maxNumFragmentsRequired = 2;
      String fragmentSeparator = "...";
      QueryScorer scorer = new QueryScorer(query, null);
      TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);

      Highlighter highlighter = new Highlighter(this, scorer);
@@ -1011,11 +1034,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    numHighlights = 0;

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
      int maxNumFragmentsRequired = 2;
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
      int maxNumFragmentsRequired = 2;
      String fragmentSeparator = "...";
      QueryScorer scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);

      Highlighter highlighter = new Highlighter(this, scorer);
@@ -1185,8 +1208,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
    numHighlights = 0;
    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      Highlighter highlighter = getHighlighter(query, FIELD_NAME,
          HighlighterTest.this);
@@ -1199,21 +1223,25 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter

    numHighlights = 0;
    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
      Highlighter highlighter = getHighlighter(query, FIELD_NAME,
          HighlighterTest.this);
      highlighter.getBestFragment(analyzer, FIELD_NAME, text);
      highlighter.getBestFragment(tokenStream, text);
    }
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
        numHighlights == 4);

    numHighlights = 0;
    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      Highlighter highlighter = getHighlighter(query, FIELD_NAME,
          HighlighterTest.this);
      highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10);
      highlighter.getBestFragments(tokenStream, text, 10);
    }
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
        numHighlights == 4);
@@ -1339,8 +1367,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      Highlighter highlighter = getHighlighter(query, FIELD_NAME,
          HighlighterTest.this);// new Highlighter(this, new
@@ -1368,9 +1397,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
  }

  public void testMaxSizeHighlight() throws Exception {
    final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
    // we disable MockTokenizer checks because we will forcefully limit the
    // we disable MockTokenizer checks because we will forcefully limit the
    // tokenstream and call end() before incrementToken() returns false.
    // But we first need to clear the re-used tokenstream components that have enableChecks.
    analyzer.getReuseStrategy().setReusableComponents(analyzer, FIELD_NAME, null);
    analyzer.setEnableChecks(false);
    TestHighlightRunner helper = new TestHighlightRunner() {
@@ -1471,8 +1501,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    numHighlights = 0;
    // test to show how rewritten query can still be used
    searcher = newSearcher(reader);
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);

    BooleanQuery query = new BooleanQuery();
    query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD);
    query.add(new WildcardQuery(new Term(FIELD_NAME, "kenned*")), Occur.SHOULD);
@@ -1491,8 +1520,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    int maxNumFragmentsRequired = 3;

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(FIELD_NAME);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
      Highlighter highlighter = getHighlighter(query, FIELD_NAME, HighlighterTest.this, false);

      highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -1823,12 +1853,6 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    searchIndex();
  }

  private Document doc( String f, String v ){
    Document doc = new Document();
    doc.add( new TextField( f, v, Field.Store.YES));
    return doc;
  }

  private void makeIndex() throws IOException {
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
    writer.addDocument( doc( "t_text1", "random words for highlighting tests del" ) );
@@ -1867,6 +1891,34 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    reader.close();
  }

  /** If we have term vectors, we can highlight based on payloads */
  public void testPayloadQuery() throws IOException, InvalidTokenOffsetsException {
    final String text = "random words and words";//"words" at positions 1 & 4

    Analyzer analyzer = new MockPayloadAnalyzer();//sets payload to "pos: X" (where X is position #)
    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
      writer.deleteAll();
      Document doc = new Document();

      doc.add(new Field(FIELD_NAME, text, FIELD_TYPE_TV));
      writer.addDocument(doc);
      writer.commit();
    }
    try (IndexReader reader = DirectoryReader.open(dir)) {
      Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "words")),
          Collections.singleton("pos: 1".getBytes("UTF-8")));//just match the first "word" occurrence
      IndexSearcher searcher = newSearcher(reader);
      Scorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME);
      Highlighter h = new Highlighter(scorer);

      TopDocs hits = searcher.search(query, null, 10);
      assertEquals(1, hits.scoreDocs.length);
      TokenStream stream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), 0, FIELD_NAME, analyzer);
      String result = h.getBestFragment(stream, text);
      assertEquals("random <B>words</B> and words", result);//only highlight first "word"
    }
  }

  /*
   *
   * public void testBigramAnalyzer() throws IOException, ParseException {
@@ -1934,14 +1986,21 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
  public void setUp() throws Exception {
    super.setUp();

    //Not many use this setup:
    a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
    dir = newDirectory();

    //Most tests use this setup:
    analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
    ramDir = newDirectory();
    IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
    fieldType = random().nextBoolean() ? FIELD_TYPE_TV : TextField.TYPE_STORED;
    IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(analyzer));

    for (String text : texts) {
      addDoc(writer, text);
      writer.addDocument(doc(FIELD_NAME, text));
    }

    // a few tests need other docs...:
    Document doc = new Document();
    doc.add(new IntField(NUMERIC_FIELD_NAME, 1, Field.Store.NO));
    doc.add(new StoredField(NUMERIC_FIELD_NAME, 1));
@@ -1969,6 +2028,8 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    writer.forceMerge(1);
    writer.close();
    reader = DirectoryReader.open(ramDir);

    //Misc:
    numHighlights = 0;
  }
@@ -1979,13 +2040,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter
    ramDir.close();
    super.tearDown();
  }
  private void addDoc(IndexWriter writer, String text) throws IOException {

  private Document doc(String name, String value) {
    Document d = new Document();

    Field f = new TextField(FIELD_NAME, text, Field.Store.YES);
    d.add(f);
    writer.addDocument(d);

    d.add(new Field(name, value, fieldType));//fieldType is randomly chosen for term vectors in setUp
    return d;
  }

  private static Token createToken(String term, int start, int offset)
@@ -2164,11 +2223,13 @@ final class SynonymTokenizer extends TokenStream {
      throws Exception {

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
      String text = doc.get(HighlighterTest.FIELD_NAME);
      int maxNumFragmentsRequired = 2;
      String fragmentSeparator = "...";
      Scorer scorer = null;
      TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);
      TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(),
          hits.scoreDocs[i].doc, HighlighterTest.FIELD_NAME, doc, analyzer);
      if (mode == QUERY) {
        scorer = new QueryScorer(query);
      } else if (mode == QUERY_TERM) {
@@ -2176,7 +2237,6 @@ final class SynonymTokenizer extends TokenStream {
      }
      Highlighter highlighter = new Highlighter(formatter, scorer);
      highlighter.setTextFragmenter(frag);

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          fragmentSeparator);
      if (LuceneTestCase.VERBOSE) System.out.println("\t" + result);