mirror of https://github.com/apache/lucene.git
LUCENE-6034: Highlighter QueryScorer/WeightedSpanTermExtractor shouldn't re-invert a term vector based TokenStream. It can now highlight payload-sensitive queries.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1643316 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent e460ce328c
commit 978aac3184
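In practice the change means a payload-sensitive query can now be highlighted end to end, provided the field stores term vectors with positions, offsets, and payloads. The following is a minimal sketch modeled on the testPayloadQuery test added further down in this commit; reader, docId, analyzer, storedText, and the field name "field" are placeholders, not part of the commit itself:

    // Match "words" only where its payload is "pos: 1" (see MockPayloadAnalyzer in the test below).
    Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term("field", "words")),
        Collections.singleton("pos: 1".getBytes("UTF-8")));
    IndexSearcher searcher = new IndexSearcher(reader);
    QueryScorer scorer = new QueryScorer(query, reader, "field");
    Highlighter highlighter = new Highlighter(scorer);
    // TokenSources hands back a term vector based TokenStream when one is available;
    // QueryScorer now uses it directly instead of re-inverting it with MemoryIndex.
    TokenStream stream = TokenSources.getAnyTokenStream(reader, docId, "field", analyzer);
    String fragment = highlighter.getBestFragment(stream, storedText);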
CHANGES.txt:
@@ -119,6 +119,10 @@ New Features

 * LUCENE-6088: TermsFilter implements Accountable. (Adrien Grand)

+* LUCENE-6034: The default highlighter when used with QueryScorer will highlight payload-sensitive
+  queries provided that term vectors with positions, offsets, and payloads are present. This is the
+  only highlighter that can highlight such queries accurately. (David Smiley)
+
 Optimizations

 * LUCENE-5960: Use a more efficient bitset, not a Set<Integer>, to

@@ -161,6 +165,10 @@ Optimizations

 * LUCENE-6089, LUCENE-6090: Tune CompressionMode.HIGH_COMPRESSION for
   better compression and less cpu usage. (Adrien Grand, Robert Muir)

+* LUCENE-6034: QueryScorer, used by the default highlighter, needn't re-index the provided
+  TokenStream with MemoryIndex when it comes from TokenSources (term vectors) with offsets and
+  positions. (David Smiley)
+
 API Changes

 * LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
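Both CHANGES entries hinge on the field being indexed with the right term vector options. A minimal FieldType setup, mirroring the FIELD_TYPE_TV constant added to HighlighterTest further down (the field name "body" and the doc/text variables are placeholders), would look like:

    FieldType tvType = new FieldType(TextField.TYPE_STORED);
    tvType.setStoreTermVectors(true);
    tvType.setStoreTermVectorPositions(true);
    tvType.setStoreTermVectorOffsets(true);
    tvType.setStoreTermVectorPayloads(true); // payloads only matter for payload-sensitive queries
    tvType.freeze();
    doc.add(new Field("body", text, tvType));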
QueryScorer.java:
@@ -265,7 +265,8 @@ public class QueryScorer implements Scorer {
   * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
   * ensure an efficient reset - if you are already using a different caching
   * {@link TokenStream} impl and you don't want it to be wrapped, set this to
-  * false.
+  * false. Note that term-vector based tokenstreams are detected and won't be
+  * wrapped either.
   */
  public void setWrapIfNotCachingTokenFilter(boolean wrap) {
    this.wrapToCaching = wrap;
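As a small illustration of the setter documented above (query and FIELD_NAME are assumed; this snippet is not part of the commit): a caller that already supplies its own caching TokenStream can opt out of the extra wrap, while term vector based streams are now skipped automatically:

    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    scorer.setWrapIfNotCachingTokenFilter(false); // the stream passed in is already cached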
TermVectorLeafReader.java (new file):
@@ -0,0 +1,176 @@
package org.apache.lucene.search.highlight;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.Bits;

/**
 * Wraps a Terms with a {@link org.apache.lucene.index.LeafReader}, typically from term vectors.
 *
 * @lucene.experimental
 */
public class TermVectorLeafReader extends LeafReader {

  private final Fields fields;
  private final FieldInfos fieldInfos;

  public TermVectorLeafReader(String field, Terms terms) {
    fields = new Fields() {
      @Override
      public Iterator<String> iterator() {
        return Collections.singletonList(field).iterator();
      }

      @Override
      public Terms terms(String fld) throws IOException {
        if (!field.equals(fld)) {
          return null;
        }
        return terms;
      }

      @Override
      public int size() {
        return 1;
      }
    };

    IndexOptions indexOptions;
    if (!terms.hasFreqs()) {
      indexOptions = IndexOptions.DOCS;
    } else if (!terms.hasPositions()) {
      indexOptions = IndexOptions.DOCS_AND_FREQS;
    } else if (!terms.hasOffsets()) {
      indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    } else {
      indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    }
    FieldInfo fieldInfo = new FieldInfo(field, 0,
                                        true, true, terms.hasPayloads(),
                                        indexOptions, DocValuesType.NONE, -1, null);
    fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo});
  }

  @Override
  public void addCoreClosedListener(CoreClosedListener listener) {
    addCoreClosedListenerAsReaderClosedListener(this, listener);
  }

  @Override
  public void removeCoreClosedListener(CoreClosedListener listener) {
    removeCoreClosedListenerAsReaderClosedListener(this, listener);
  }

  @Override
  protected void doClose() throws IOException {
  }

  @Override
  public Fields fields() throws IOException {
    return fields;
  }

  @Override
  public NumericDocValues getNumericDocValues(String field) throws IOException {
    return null;
  }

  @Override
  public BinaryDocValues getBinaryDocValues(String field) throws IOException {
    return null;
  }

  @Override
  public SortedDocValues getSortedDocValues(String field) throws IOException {
    return null;
  }

  @Override
  public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
    return null;
  }

  @Override
  public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
    return null;
  }

  @Override
  public Bits getDocsWithField(String field) throws IOException {
    return null;
  }

  @Override
  public NumericDocValues getNormValues(String field) throws IOException {
    return null;//Is this needed? See MemoryIndex for a way to do it.
  }

  @Override
  public FieldInfos getFieldInfos() {
    return fieldInfos;
  }

  @Override
  public Bits getLiveDocs() {
    return null;
  }

  @Override
  public void checkIntegrity() throws IOException {
  }

  @Override
  public Fields getTermVectors(int docID) throws IOException {
    if (docID != 0) {
      return null;
    }
    return fields();
  }

  @Override
  public int numDocs() {
    return 1;
  }

  @Override
  public int maxDoc() {
    return 1;
  }

  @Override
  public void document(int docID, StoredFieldVisitor visitor) throws IOException {
  }

}
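A rough usage sketch for the new class (this mirrors how WeightedSpanTermExtractor uses it below; reader, docId, and field are assumed, and every call here can throw IOException):

    Terms vector = reader.getTermVector(docId, field);
    if (vector != null && vector.hasPositions() && vector.hasOffsets()) {
      LeafReader leaf = new TermVectorLeafReader(field, vector);
      Terms terms = leaf.terms(field);  // the single wrapped field; leaf.maxDoc() == 1
      // postings obtained from 'terms' enumerate document 0 only
    }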
TokenSources.java:
@@ -36,7 +36,7 @@ import org.apache.lucene.index.Terms;
  */
 public class TokenSources {
   /**
-   * A convenience method that tries to first get a TermPositionVector for the
+   * A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
    * specified docId, then, falls back to using the passed in
    * {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
    * This is useful when you already have the document, but would prefer to use
WeightedSpanTermExtractor.java:
@@ -16,6 +16,7 @@ package org.apache.lucene.search.highlight;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;

@@ -29,13 +30,13 @@ import java.util.TreeSet;

 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.index.FilterLeafReader;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FilterLeafReader;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.Term;

@@ -43,7 +44,18 @@ import org.apache.lucene.index.TermContext;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.queries.CommonTermsQuery;
-import org.apache.lucene.search.*;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.FilteredQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.join.ToChildBlockJoinQuery;
 import org.apache.lucene.search.join.ToParentBlockJoinQuery;
 import org.apache.lucene.search.spans.FieldMaskingSpanQuery;

@@ -65,7 +77,7 @@ import org.apache.lucene.util.IOUtils;
 public class WeightedSpanTermExtractor {

   private String fieldName;
-  private TokenStream tokenStream;
+  private TokenStream tokenStream;//set subsequent to getWeightedSpanTerms* methods
   private String defaultField;
   private boolean expandMultiTermQuery;
   private boolean cachedTokenStream;

@@ -209,6 +221,8 @@ public class WeightedSpanTermExtractor {
         sp.setBoost(query.getBoost());
         extractWeightedSpanTerms(terms, sp);
       }
+    } else if (query instanceof MatchAllDocsQuery) {
+      //nothing
     } else {
       Query origQuery = query;
       if (query instanceof MultiTermQuery) {

@@ -357,18 +371,39 @@ public class WeightedSpanTermExtractor {

   protected LeafReaderContext getLeafContext() throws IOException {
     if (internalReader == null) {
-      if(wrapToCaching && !(tokenStream instanceof CachingTokenFilter)) {
+      boolean cacheIt = wrapToCaching && !(tokenStream instanceof CachingTokenFilter);
+
+      // If it's from term vectors, simply wrap the underlying Terms in a reader
+      if (tokenStream instanceof TokenStreamFromTermVector) {
+        cacheIt = false;
+        Terms termVectorTerms = ((TokenStreamFromTermVector) tokenStream).getTermVectorTerms();
+        if (termVectorTerms.hasPositions() && termVectorTerms.hasOffsets()) {
+          internalReader = new TermVectorLeafReader(DelegatingLeafReader.FIELD_NAME, termVectorTerms);
+        }
+      }
+
+      // Use MemoryIndex (index/invert this tokenStream now)
+      if (internalReader == null) {
+        final MemoryIndex indexer = new MemoryIndex(true);
+        if (cacheIt) {
           assert !cachedTokenStream;
           tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
           cachedTokenStream = true;
-      }
-      final MemoryIndex indexer = new MemoryIndex(true);
           indexer.addField(DelegatingLeafReader.FIELD_NAME, tokenStream);
-      tokenStream.reset();
+        } else {
+          indexer.addField(DelegatingLeafReader.FIELD_NAME,
+              new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
+        }
+        tokenStream.reset();//reset to beginning when we return
         final IndexSearcher searcher = indexer.createSearcher();
         // MEM index has only atomic ctx
-      internalReader = new DelegatingLeafReader(((LeafReaderContext)searcher.getTopReaderContext()).reader());
+        internalReader = ((LeafReaderContext) searcher.getTopReaderContext()).reader();
+      }
+
+      //Now wrap it so we always use a common field.
+      this.internalReader = new DelegatingLeafReader(internalReader);
     }

     return internalReader.getContext();
   }

@@ -623,7 +658,10 @@ public class WeightedSpanTermExtractor {
     return cachedTokenStream;
   }

+  /** Returns the tokenStream which may have been wrapped in a CachingTokenFilter.
+   * getWeightedSpanTerms* sets the tokenStream, so don't call this before. */
   public TokenStream getTokenStream() {
+    assert tokenStream != null;
     return tokenStream;
   }

@@ -632,12 +670,16 @@ public class WeightedSpanTermExtractor {
   * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
   * ensure an efficient reset - if you are already using a different caching
   * {@link TokenStream} impl and you don't want it to be wrapped, set this to
-  * false.
+  * false. This setting is ignored when a term vector based TokenStream is supplied,
+  * since it can be reset efficiently.
   */
  public void setWrapIfNotCachingTokenFilter(boolean wrap) {
    this.wrapToCaching = wrap;
  }

+  /** A threshold of number of characters to analyze. When a TokenStream based on
+   * term vectors with offsets and positions are supplied, this setting
+   * does not apply. */
  protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
    this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
  }
HighlighterTest.java:
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;

@@ -36,6 +37,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntField;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.TextField;
@@ -78,8 +80,20 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
   Directory ramDir;
   public IndexSearcher searcher = null;
   int numHighlights = 0;
-  Analyzer analyzer;
+  MockAnalyzer analyzer;
   TopDocs hits;
+  FieldType fieldType;//see doc()
+
+  final FieldType FIELD_TYPE_TV;
+  {
+    FieldType fieldType = new FieldType(TextField.TYPE_STORED);
+    fieldType.setStoreTermVectors(true);
+    fieldType.setStoreTermVectorPositions(true);
+    fieldType.setStoreTermVectorPayloads(true);
+    fieldType.setStoreTermVectorOffsets(true);
+    fieldType.freeze();
+    FIELD_TYPE_TV = fieldType;
+  }

   String[] texts = {
     "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
@@ -121,9 +135,8 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
   }

   public void testHighlightingCommonTermsQuery() throws Exception {
-    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
     CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
-    query.add(new Term(FIELD_NAME, "this"));
+    query.add(new Term(FIELD_NAME, "this"));//stop-word
     query.add(new Term(FIELD_NAME, "long"));
     query.add(new Term(FIELD_NAME, "very"));

@@ -141,7 +154,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
     highlighter.setTextFragmenter(fragmenter);
     String fragment = highlighter.getBestFragment(stream, storedField);
-    assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
+    assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);

     doc = searcher.doc(hits.scoreDocs[1].doc);
     storedField = doc.get(FIELD_NAME);

@@ -150,7 +163,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
         .getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
     highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
     fragment = highlighter.getBestFragment(stream, storedField);
-    assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
+    assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
   }

   public void testHighlightUnknowQueryAfterRewrite() throws IOException, InvalidTokenOffsetsException {

@@ -159,7 +172,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
       @Override
       public Query rewrite(IndexReader reader) throws IOException {
         CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
-        query.add(new Term(FIELD_NAME, "this"));
+        query.add(new Term(FIELD_NAME, "this"));//stop-word
         query.add(new Term(FIELD_NAME, "long"));
         query.add(new Term(FIELD_NAME, "very"));
         return query;
@@ -181,8 +194,6 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
       }
     };

-    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
-
     searcher = newSearcher(reader);
     TopDocs hits = searcher.search(query, 10);
     assertEquals(2, hits.totalHits);

@@ -197,7 +208,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
     highlighter.setTextFragmenter(fragmenter);
     String fragment = highlighter.getBestFragment(stream, storedField);
-    assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
+    assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);

     doc = searcher.doc(hits.scoreDocs[1].doc);
     storedField = doc.get(FIELD_NAME);

@@ -206,7 +217,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
         .getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
     highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
     fragment = highlighter.getBestFragment(stream, storedField);
-    assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
+    assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);

   }

@@ -252,8 +263,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
    */
   private String highlightField(Query query, String fieldName, String text)
       throws IOException, InvalidTokenOffsetsException {
-    TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)
-        .tokenStream(fieldName, text);
+    TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
     // Assuming "<B>", "</B>" used to highlight
     SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
     QueryScorer scorer = new QueryScorer(query, fieldName, FIELD_NAME);
@@ -351,8 +361,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     Highlighter highlighter = new Highlighter(this, scorer);

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       highlighter.setTextFragmenter(new SimpleFragmenter(40));

@@ -380,8 +391,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     highlighter = new Highlighter(this, scorer);

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       highlighter.setTextFragmenter(new SimpleFragmenter(40));

@@ -409,8 +421,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     highlighter = new Highlighter(this, scorer);

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       highlighter.setTextFragmenter(new SimpleFragmenter(40));

@@ -434,8 +447,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     Highlighter highlighter = new Highlighter(this, scorer);

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       highlighter.setTextFragmenter(new SimpleFragmenter(40));

@@ -458,8 +472,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     Highlighter highlighter = new Highlighter(this, scorer);

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       highlighter.setTextFragmenter(new SimpleFragmenter(40));

@@ -482,8 +497,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     Highlighter highlighter = new Highlighter(this, scorer);

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       highlighter.setTextFragmenter(new SimpleFragmenter(40));

@@ -567,7 +583,8 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {

     for (int i = 0; i < hits.totalHits; i++) {
       String text = "parent document";
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       highlighter.setTextFragmenter(new SimpleFragmenter(40));
       highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");

@@ -592,8 +609,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     highlighter.setTextFragmenter(new SimpleFragmenter(40));

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
           "...");

@@ -614,8 +632,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     int maxNumFragmentsRequired = 2;

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
       QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
       Highlighter highlighter = new Highlighter(this, scorer);

@@ -644,8 +663,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     Highlighter highlighter = new Highlighter(this, scorer);

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 5));

@@ -698,8 +718,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     Highlighter highlighter = new Highlighter(this,scorer);

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       highlighter.setTextFragmenter(new SimpleFragmenter(40));

@@ -769,8 +790,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     highlighter.setTextFragmenter(new SimpleFragmenter(40));
     int maxNumFragmentsRequired = 2;
     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
           "...");
@@ -963,11 +985,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     hits = searcher.search(query, null, 1000);

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
-      int maxNumFragmentsRequired = 2;
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer); int maxNumFragmentsRequired = 2;
       String fragmentSeparator = "...";
       QueryScorer scorer = new QueryScorer(query, HighlighterTest.FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);

       Highlighter highlighter = new Highlighter(this, scorer);

@@ -987,11 +1009,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     numHighlights = 0;

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
       int maxNumFragmentsRequired = 2;
       String fragmentSeparator = "...";
       QueryScorer scorer = new QueryScorer(query, null);
-      TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);

       Highlighter highlighter = new Highlighter(this, scorer);

@@ -1011,11 +1034,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     numHighlights = 0;

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
-      int maxNumFragmentsRequired = 2;
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer); int maxNumFragmentsRequired = 2;
       String fragmentSeparator = "...";
       QueryScorer scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);

       Highlighter highlighter = new Highlighter(this, scorer);

@@ -1185,8 +1208,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
     numHighlights = 0;
     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       Highlighter highlighter = getHighlighter(query, FIELD_NAME,
           HighlighterTest.this);

@@ -1199,21 +1223,25 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {

     numHighlights = 0;
     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
       Highlighter highlighter = getHighlighter(query, FIELD_NAME,
           HighlighterTest.this);
-      highlighter.getBestFragment(analyzer, FIELD_NAME, text);
+      highlighter.getBestFragment(tokenStream, text);
     }
     assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
         numHighlights == 4);

     numHighlights = 0;
     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+
       Highlighter highlighter = getHighlighter(query, FIELD_NAME,
           HighlighterTest.this);
-      highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10);
+      highlighter.getBestFragments(tokenStream, text, 10);
     }
     assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
         numHighlights == 4);
@@ -1339,8 +1367,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

       Highlighter highlighter = getHighlighter(query, FIELD_NAME,
           HighlighterTest.this);// new Highlighter(this, new

@@ -1368,9 +1397,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
   }

   public void testMaxSizeHighlight() throws Exception {
-    final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
     // we disable MockTokenizer checks because we will forcefully limit the
     // tokenstream and call end() before incrementToken() returns false.
+    // But we first need to clear the re-used tokenstream components that have enableChecks.
+    analyzer.getReuseStrategy().setReusableComponents(analyzer, FIELD_NAME, null);
     analyzer.setEnableChecks(false);
     TestHighlightRunner helper = new TestHighlightRunner() {

@@ -1471,7 +1501,6 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     numHighlights = 0;
     // test to show how rewritten query can still be used
     searcher = newSearcher(reader);
-    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);

     BooleanQuery query = new BooleanQuery();
     query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD);

@@ -1491,8 +1520,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     int maxNumFragmentsRequired = 3;

     for (int i = 0; i < hits.totalHits; i++) {
-      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+      final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+      String text = doc.get(FIELD_NAME);
+      TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
       Highlighter highlighter = getHighlighter(query, FIELD_NAME, HighlighterTest.this, false);

       highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -1823,12 +1853,6 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     searchIndex();
   }

-  private Document doc( String f, String v ){
-    Document doc = new Document();
-    doc.add( new TextField( f, v, Field.Store.YES));
-    return doc;
-  }
-
   private void makeIndex() throws IOException {
     IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
     writer.addDocument( doc( "t_text1", "random words for highlighting tests del" ) );
@@ -1867,6 +1891,34 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     reader.close();
   }

+  /** If we have term vectors, we can highlight based on payloads */
+  public void testPayloadQuery() throws IOException, InvalidTokenOffsetsException {
+    final String text = "random words and words";//"words" at positions 1 & 4
+
+    Analyzer analyzer = new MockPayloadAnalyzer();//sets payload to "pos: X" (where X is position #)
+    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
+      writer.deleteAll();
+      Document doc = new Document();
+
+      doc.add(new Field(FIELD_NAME, text, FIELD_TYPE_TV));
+      writer.addDocument(doc);
+      writer.commit();
+    }
+    try (IndexReader reader = DirectoryReader.open(dir)) {
+      Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "words")),
+          Collections.singleton("pos: 1".getBytes("UTF-8")));//just match the first "word" occurrence
+      IndexSearcher searcher = newSearcher(reader);
+      Scorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME);
+      Highlighter h = new Highlighter(scorer);
+
+      TopDocs hits = searcher.search(query, null, 10);
+      assertEquals(1, hits.scoreDocs.length);
+      TokenStream stream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), 0, FIELD_NAME, analyzer);
+      String result = h.getBestFragment(stream, text);
+      assertEquals("random <B>words</B> and words", result);//only highlight first "word"
+    }
+  }
+
   /*
    *
    * public void testBigramAnalyzer() throws IOException, ParseException {
@@ -1934,14 +1986,21 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
   public void setUp() throws Exception {
     super.setUp();

+    //Not many use this setup:
     a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
-    analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
     dir = newDirectory();

+    //Most tests use this setup:
+    analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
     ramDir = newDirectory();
-    IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
+    fieldType = random().nextBoolean() ? FIELD_TYPE_TV : TextField.TYPE_STORED;
+    IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(analyzer));

     for (String text : texts) {
-      addDoc(writer, text);
+      writer.addDocument(doc(FIELD_NAME, text));
     }

+    // a few tests need other docs...:
     Document doc = new Document();
     doc.add(new IntField(NUMERIC_FIELD_NAME, 1, Field.Store.NO));
     doc.add(new StoredField(NUMERIC_FIELD_NAME, 1));

@@ -1969,6 +2028,8 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     writer.forceMerge(1);
     writer.close();
     reader = DirectoryReader.open(ramDir);
+
+    //Misc:
     numHighlights = 0;
   }

@@ -1979,13 +2040,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
     ramDir.close();
     super.tearDown();
   }
-  private void addDoc(IndexWriter writer, String text) throws IOException {
+  private Document doc(String name, String value) {
     Document d = new Document();
-    Field f = new TextField(FIELD_NAME, text, Field.Store.YES);
-    d.add(f);
-    writer.addDocument(d);
-
+    d.add(new Field(name, value, fieldType));//fieldType is randomly chosen for term vectors in setUp
+    return d;
   }

   private static Token createToken(String term, int start, int offset)
@@ -2164,11 +2223,13 @@ final class SynonymTokenizer extends TokenStream {
         throws Exception {

       for (int i = 0; i < hits.totalHits; i++) {
-        String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
+        final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+        String text = doc.get(HighlighterTest.FIELD_NAME);
         int maxNumFragmentsRequired = 2;
         String fragmentSeparator = "...";
         Scorer scorer = null;
-        TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);
+        TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(),
+            hits.scoreDocs[i].doc, HighlighterTest.FIELD_NAME, doc, analyzer);
         if (mode == QUERY) {
           scorer = new QueryScorer(query);
         } else if (mode == QUERY_TERM) {

@@ -2176,7 +2237,6 @@ final class SynonymTokenizer extends TokenStream {
         }
         Highlighter highlighter = new Highlighter(formatter, scorer);
         highlighter.setTextFragmenter(frag);
-
         String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
             fragmentSeparator);
         if (LuceneTestCase.VERBOSE) System.out.println("\t" + result);