mirror of https://github.com/apache/lucene.git
LUCENE-6445: Highlighter TokenSources simplification
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1676540 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3b868b3897
commit
736a9b6934
|
@ -18,5 +18,6 @@
|
||||||
<orderEntry type="module" module-name="queries" />
|
<orderEntry type="module" module-name="queries" />
|
||||||
<orderEntry type="module" module-name="lucene-core" />
|
<orderEntry type="module" module-name="lucene-core" />
|
||||||
<orderEntry type="module" module-name="join" />
|
<orderEntry type="module" module-name="join" />
|
||||||
|
<orderEntry type="module" module-name="analysis-common" />
|
||||||
</component>
|
</component>
|
||||||
</module>
|
</module>
|
||||||
|
|
|
@ -147,6 +147,9 @@ API Changes
|
||||||
|
|
||||||
* LUCENE-6446: Simplified Explanation API. (Adrien Grand)
|
* LUCENE-6446: Simplified Explanation API. (Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-6445: Two new methods in Highlighter's TokenSources; the existing
|
||||||
|
methods are now marked deprecated. (David Smiley)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
|
|
||||||
* LUCENE-6413: Test runner should report the number of suites completed/
|
* LUCENE-6413: Test runner should report the number of suites completed/
|
||||||
|
|
|
@ -17,10 +17,14 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.StoredDocument;
|
import org.apache.lucene.index.StoredDocument;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
|
@ -30,11 +34,6 @@ import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
||||||
import org.apache.lucene.search.highlight.TextFragment;
|
import org.apache.lucene.search.highlight.TextFragment;
|
||||||
import org.apache.lucene.search.highlight.TokenSources;
|
import org.apache.lucene.search.highlight.TokenSources;
|
||||||
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Collections;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
|
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
|
||||||
*
|
*
|
||||||
|
@ -103,7 +102,8 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
|
||||||
@Override
|
@Override
|
||||||
public int doHighlight(IndexReader reader, int doc, String field,
|
public int doHighlight(IndexReader reader, int doc, String field,
|
||||||
StoredDocument document, Analyzer analyzer, String text) throws Exception {
|
StoredDocument document, Analyzer analyzer, String text) throws Exception {
|
||||||
TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer);
|
final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;
|
||||||
|
TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset);
|
||||||
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
|
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
|
||||||
return frag != null ? frag.length : 0;
|
return frag != null ? frag.length : 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,20 +17,19 @@
|
||||||
|
|
||||||
package org.apache.lucene.benchmark.byTask.tasks;
|
package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import java.io.IOException;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.search.highlight.Highlighter;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
import org.apache.lucene.search.highlight.TextFragment;
|
|
||||||
import org.apache.lucene.search.highlight.QueryScorer;
|
|
||||||
import org.apache.lucene.search.highlight.TokenSources;
|
|
||||||
import org.apache.lucene.search.Query;
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.StoredDocument;
|
import org.apache.lucene.index.StoredDocument;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
import java.io.IOException;
|
import org.apache.lucene.search.highlight.Highlighter;
|
||||||
|
import org.apache.lucene.search.highlight.QueryScorer;
|
||||||
|
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
||||||
|
import org.apache.lucene.search.highlight.TextFragment;
|
||||||
|
import org.apache.lucene.search.highlight.TokenSources;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test Search task which counts number of searches.
|
* Test Search task which counts number of searches.
|
||||||
|
@ -59,7 +58,8 @@ public class CountingHighlighterTestTask extends SearchTravRetHighlightTask {
|
||||||
return new BenchmarkHighlighter() {
|
return new BenchmarkHighlighter() {
|
||||||
@Override
|
@Override
|
||||||
public int doHighlight(IndexReader reader, int doc, String field, StoredDocument document, Analyzer analyzer, String text) throws Exception {
|
public int doHighlight(IndexReader reader, int doc, String field, StoredDocument document, Analyzer analyzer, String text) throws Exception {
|
||||||
TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer);
|
final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;
|
||||||
|
TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset);
|
||||||
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
|
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
|
||||||
numHighlightedResults += frag != null ? frag.length : 0;
|
numHighlightedResults += frag != null ? frag.length : 0;
|
||||||
return frag != null ? frag.length : 0;
|
return frag != null ? frag.length : 0;
|
||||||
|
|
|
@ -32,12 +32,12 @@
|
||||||
<pathelement path="${memory.jar}"/>
|
<pathelement path="${memory.jar}"/>
|
||||||
<pathelement path="${queries.jar}"/>
|
<pathelement path="${queries.jar}"/>
|
||||||
<pathelement path="${join.jar}"/>
|
<pathelement path="${join.jar}"/>
|
||||||
|
<pathelement path="${analyzers-common.jar}"/>
|
||||||
<path refid="base.classpath"/>
|
<path refid="base.classpath"/>
|
||||||
</path>
|
</path>
|
||||||
|
|
||||||
<target name="init" depends="module-build.init,jar-memory,jar-queries,jar-join"/>
|
<target name="compile-core" depends="jar-memory,jar-queries,jar-join,jar-analyzers-common,common.compile-core" />
|
||||||
|
|
||||||
<target name="compile-core" depends="jar-memory, common.compile-core, jar-join" />
|
|
||||||
<target name="javadocs" depends="javadocs-memory,compile-core,check-javadocs-uptodate"
|
<target name="javadocs" depends="javadocs-memory,compile-core,check-javadocs-uptodate"
|
||||||
unless="javadocs-uptodate-${name}">
|
unless="javadocs-uptodate-${name}">
|
||||||
<invoke-module-javadoc>
|
<invoke-module-javadoc>
|
||||||
|
|
|
@ -24,17 +24,75 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
|
||||||
import org.apache.lucene.index.Fields;
|
import org.apache.lucene.index.Fields;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.StoredDocument;
|
import org.apache.lucene.index.StoredDocument;
|
||||||
import org.apache.lucene.index.Terms;
|
import org.apache.lucene.index.Terms;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Hides implementation issues associated with obtaining a TokenStream for use
|
* Convenience methods for obtaining a {@link TokenStream} for use with the {@link Highlighter} - can obtain from
|
||||||
* with the higlighter - can obtain from TermFreqVectors with offsets and
|
* term vectors with offsets and positions or from an Analyzer re-parsing the stored content.
|
||||||
* (optionally) positions or from Analyzer class reparsing the stored content.
|
*
|
||||||
|
* @see TokenStreamFromTermVector
|
||||||
*/
|
*/
|
||||||
public class TokenSources {
|
public class TokenSources {
|
||||||
|
|
||||||
|
private TokenSources() {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a token stream from either un-inverting a term vector if possible, or by analyzing the text.
|
||||||
|
*
|
||||||
|
* WARNING: Don't call this if there is more than one value for this field. If there are, and if there are term
|
||||||
|
* vectors, then there is a single tokenstream with offsets suggesting all the field values were concatenated.
|
||||||
|
*
|
||||||
|
* @param field The field to either get term vectors from or to analyze the text from.
|
||||||
|
* @param tvFields from {@link IndexReader#getTermVectors(int)}. Possibly null. For performance, this instance should
|
||||||
|
* be re-used for the same document (e.g. when highlighting multiple fields).
|
||||||
|
* @param text the text to analyze, failing term vector un-inversion
|
||||||
|
* @param analyzer the analyzer to analyze {@code text} with, failing term vector un-inversion
|
||||||
|
* @param maxStartOffset Terms with a startOffset greater than this aren't returned. Use -1 for no limit.
|
||||||
|
* Suggest using {@link Highlighter#getMaxDocCharsToAnalyze()} - 1.
|
||||||
|
*
|
||||||
|
* @return a token stream from either term vectors, or from analyzing the text. Never null.
|
||||||
|
*/
|
||||||
|
public static TokenStream getTokenStream(String field, Fields tvFields, String text, Analyzer analyzer,
|
||||||
|
int maxStartOffset) throws IOException {
|
||||||
|
TokenStream tokenStream = getTermVectorTokenStreamOrNull(field, tvFields, maxStartOffset);
|
||||||
|
if (tokenStream != null) {
|
||||||
|
return tokenStream;
|
||||||
|
}
|
||||||
|
tokenStream = analyzer.tokenStream(field, text);
|
||||||
|
if (maxStartOffset >= 0 && maxStartOffset < text.length() - 1) {
|
||||||
|
tokenStream = new LimitTokenOffsetFilter(tokenStream, maxStartOffset);
|
||||||
|
}
|
||||||
|
return tokenStream;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a token stream by un-inverting the term vector. This method returns null if {@code tvFields} is null
|
||||||
|
* or if the field has no term vector, or if the term vector doesn't have offsets. Positions are recommended on the
|
||||||
|
* term vector but it isn't strictly required.
|
||||||
|
*
|
||||||
|
* @param field The field to get term vectors from.
|
||||||
|
* @param tvFields from {@link IndexReader#getTermVectors(int)}. Possibly null. For performance, this instance should
|
||||||
|
* be re-used for the same document (e.g. when highlighting multiple fields).
|
||||||
|
* @param maxStartOffset Terms with a startOffset greater than this aren't returned. Use -1 for no limit.
|
||||||
|
* Suggest using {@link Highlighter#getMaxDocCharsToAnalyze()} - 1
|
||||||
|
* @return a token stream from term vectors. Null if no term vectors with the right options.
|
||||||
|
*/
|
||||||
|
public static TokenStream getTermVectorTokenStreamOrNull(String field, Fields tvFields, int maxStartOffset)
|
||||||
|
throws IOException {
|
||||||
|
if (tvFields == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
final Terms tvTerms = tvFields.terms(field);
|
||||||
|
if (tvTerms == null || !tvTerms.hasOffsets()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return new TokenStreamFromTermVector(tvTerms, maxStartOffset);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
|
* A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
|
||||||
* specified docId, then, falls back to using the passed in
|
* specified docId, then, falls back to using the passed in
|
||||||
|
@ -54,7 +112,7 @@ public class TokenSources {
|
||||||
* {@link org.apache.lucene.document.Document}
|
* {@link org.apache.lucene.document.Document}
|
||||||
* @throws IOException if there was an error loading
|
* @throws IOException if there was an error loading
|
||||||
*/
|
*/
|
||||||
|
@Deprecated // maintenance reasons LUCENE-6445
|
||||||
public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
|
public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
|
||||||
String field, StoredDocument document, Analyzer analyzer) throws IOException {
|
String field, StoredDocument document, Analyzer analyzer) throws IOException {
|
||||||
TokenStream ts = null;
|
TokenStream ts = null;
|
||||||
|
@ -83,6 +141,7 @@ public class TokenSources {
|
||||||
* @return null if field not stored correctly
|
* @return null if field not stored correctly
|
||||||
* @throws IOException If there is a low-level I/O error
|
* @throws IOException If there is a low-level I/O error
|
||||||
*/
|
*/
|
||||||
|
@Deprecated // maintenance reasons LUCENE-6445
|
||||||
public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
|
public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
|
||||||
String field, Analyzer analyzer) throws IOException {
|
String field, Analyzer analyzer) throws IOException {
|
||||||
TokenStream ts = null;
|
TokenStream ts = null;
|
||||||
|
@ -103,7 +162,7 @@ public class TokenSources {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms)} now. */
|
/** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms)} now. */
|
||||||
@Deprecated
|
@Deprecated // maintenance reasons LUCENE-6445
|
||||||
public static TokenStream getTokenStream(Terms vector,
|
public static TokenStream getTokenStream(Terms vector,
|
||||||
boolean tokenPositionsGuaranteedContiguous) throws IOException {
|
boolean tokenPositionsGuaranteedContiguous) throws IOException {
|
||||||
return getTokenStream(vector);
|
return getTokenStream(vector);
|
||||||
|
@ -119,6 +178,7 @@ public class TokenSources {
|
||||||
*
|
*
|
||||||
* @throws IllegalArgumentException if no offsets are available
|
* @throws IllegalArgumentException if no offsets are available
|
||||||
*/
|
*/
|
||||||
|
@Deprecated // maintenance reasons LUCENE-6445
|
||||||
public static TokenStream getTokenStream(final Terms tpv) throws IOException {
|
public static TokenStream getTokenStream(final Terms tpv) throws IOException {
|
||||||
|
|
||||||
if (!tpv.hasOffsets()) {
|
if (!tpv.hasOffsets()) {
|
||||||
|
@ -144,6 +204,7 @@ public class TokenSources {
|
||||||
*
|
*
|
||||||
* @see #getTokenStream(org.apache.lucene.index.Terms)
|
* @see #getTokenStream(org.apache.lucene.index.Terms)
|
||||||
*/
|
*/
|
||||||
|
@Deprecated // maintenance reasons LUCENE-6445
|
||||||
public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
|
public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
|
||||||
String field) throws IOException {
|
String field) throws IOException {
|
||||||
|
|
||||||
|
@ -164,13 +225,14 @@ public class TokenSources {
|
||||||
return getTokenStream(vector);
|
return getTokenStream(vector);
|
||||||
}
|
}
|
||||||
|
|
||||||
// convenience method
|
@Deprecated // maintenance reasons LUCENE-6445
|
||||||
public static TokenStream getTokenStream(IndexReader reader, int docId,
|
public static TokenStream getTokenStream(IndexReader reader, int docId,
|
||||||
String field, Analyzer analyzer) throws IOException {
|
String field, Analyzer analyzer) throws IOException {
|
||||||
StoredDocument doc = reader.document(docId);
|
StoredDocument doc = reader.document(docId);
|
||||||
return getTokenStream(doc, field, analyzer);
|
return getTokenStream(doc, field, analyzer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Deprecated // maintenance reasons LUCENE-6445
|
||||||
public static TokenStream getTokenStream(StoredDocument doc, String field,
|
public static TokenStream getTokenStream(StoredDocument doc, String field,
|
||||||
Analyzer analyzer) {
|
Analyzer analyzer) {
|
||||||
String contents = doc.get(field);
|
String contents = doc.get(field);
|
||||||
|
@ -181,7 +243,7 @@ public class TokenSources {
|
||||||
return getTokenStream(field, contents, analyzer);
|
return getTokenStream(field, contents, analyzer);
|
||||||
}
|
}
|
||||||
|
|
||||||
// convenience method
|
@Deprecated // maintenance reasons LUCENE-6445
|
||||||
public static TokenStream getTokenStream(String field, String contents,
|
public static TokenStream getTokenStream(String field, String contents,
|
||||||
Analyzer analyzer) {
|
Analyzer analyzer) {
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -23,19 +23,18 @@ import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|
||||||
import org.apache.lucene.document.FieldType;
|
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.FieldType;
|
||||||
import org.apache.lucene.document.TextField;
|
import org.apache.lucene.document.TextField;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.PhraseQuery;
|
import org.apache.lucene.search.PhraseQuery;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
|
@ -45,11 +44,12 @@ import org.apache.lucene.search.spans.SpanNearQuery;
|
||||||
import org.apache.lucene.search.spans.SpanQuery;
|
import org.apache.lucene.search.spans.SpanQuery;
|
||||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
|
||||||
import org.apache.lucene.util.FixedBitSet;
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
public class HighlighterPhraseTest extends LuceneTestCase {
|
public class HighlighterPhraseTest extends LuceneTestCase {
|
||||||
private static final String FIELD = "text";
|
private static final String FIELD = "text";
|
||||||
|
|
||||||
public void testConcurrentPhrase() throws IOException, InvalidTokenOffsetsException {
|
public void testConcurrentPhrase() throws IOException, InvalidTokenOffsetsException {
|
||||||
final String TEXT = "the fox jumped";
|
final String TEXT = "the fox jumped";
|
||||||
final Directory directory = newDirectory();
|
final Directory directory = newDirectory();
|
||||||
|
@ -80,9 +80,8 @@ public class HighlighterPhraseTest extends LuceneTestCase {
|
||||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||||
new QueryScorer(phraseQuery));
|
new QueryScorer(phraseQuery));
|
||||||
|
|
||||||
final TokenStream tokenStream = TokenSources
|
final TokenStream tokenStream =
|
||||||
.getTokenStream(indexReader.getTermVector(
|
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
|
||||||
0, FIELD));
|
|
||||||
assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
|
assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
|
||||||
TEXT), highlighter.getBestFragment(tokenStream, TEXT));
|
TEXT), highlighter.getBestFragment(tokenStream, TEXT));
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -147,9 +146,8 @@ public class HighlighterPhraseTest extends LuceneTestCase {
|
||||||
for (int position = bitset.nextSetBit(0); position < maxDoc-1; position = bitset
|
for (int position = bitset.nextSetBit(0); position < maxDoc-1; position = bitset
|
||||||
.nextSetBit(position + 1)) {
|
.nextSetBit(position + 1)) {
|
||||||
assertEquals(0, position);
|
assertEquals(0, position);
|
||||||
final TokenStream tokenStream = TokenSources.getTokenStream(
|
final TokenStream tokenStream =
|
||||||
indexReader.getTermVector(position,
|
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(position), -1);
|
||||||
FIELD));
|
|
||||||
assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
|
assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
|
||||||
TEXT), highlighter.getBestFragment(tokenStream, TEXT));
|
TEXT), highlighter.getBestFragment(tokenStream, TEXT));
|
||||||
}
|
}
|
||||||
|
@ -189,9 +187,8 @@ public class HighlighterPhraseTest extends LuceneTestCase {
|
||||||
final Highlighter highlighter = new Highlighter(
|
final Highlighter highlighter = new Highlighter(
|
||||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||||
new QueryScorer(phraseQuery));
|
new QueryScorer(phraseQuery));
|
||||||
final TokenStream tokenStream = TokenSources
|
final TokenStream tokenStream =
|
||||||
.getTokenStream(indexReader.getTermVector(
|
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
|
||||||
0, FIELD));
|
|
||||||
assertEquals(
|
assertEquals(
|
||||||
highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
|
highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
|
||||||
highlighter.getBestFragment(tokenStream, TEXT));
|
highlighter.getBestFragment(tokenStream, TEXT));
|
||||||
|
@ -230,8 +227,8 @@ public class HighlighterPhraseTest extends LuceneTestCase {
|
||||||
final Highlighter highlighter = new Highlighter(
|
final Highlighter highlighter = new Highlighter(
|
||||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||||
new QueryScorer(phraseQuery));
|
new QueryScorer(phraseQuery));
|
||||||
final TokenStream tokenStream = TokenSources.getTokenStream(
|
final TokenStream tokenStream =
|
||||||
indexReader.getTermVector(0, FIELD));
|
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
|
||||||
assertEquals("the fox <B>did</B> not <B>jump</B>", highlighter
|
assertEquals("the fox <B>did</B> not <B>jump</B>", highlighter
|
||||||
.getBestFragment(tokenStream, TEXT));
|
.getBestFragment(tokenStream, TEXT));
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -269,9 +266,8 @@ public class HighlighterPhraseTest extends LuceneTestCase {
|
||||||
final Highlighter highlighter = new Highlighter(
|
final Highlighter highlighter = new Highlighter(
|
||||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||||
new QueryScorer(phraseQuery));
|
new QueryScorer(phraseQuery));
|
||||||
final TokenStream tokenStream = TokenSources
|
final TokenStream tokenStream =
|
||||||
.getTokenStream(indexReader.getTermVector(
|
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
|
||||||
0, FIELD));
|
|
||||||
assertEquals(
|
assertEquals(
|
||||||
highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
|
highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
|
||||||
highlighter.getBestFragment(tokenStream, TEXT));
|
highlighter.getBestFragment(tokenStream, TEXT));
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.search.highlight;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import javax.xml.parsers.DocumentBuilder;
|
||||||
|
import javax.xml.parsers.DocumentBuilderFactory;
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
@ -29,9 +31,6 @@ import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.StringTokenizer;
|
import java.util.StringTokenizer;
|
||||||
|
|
||||||
import javax.xml.parsers.DocumentBuilder;
|
|
||||||
import javax.xml.parsers.DocumentBuilderFactory;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.CachingTokenFilter;
|
import org.apache.lucene.analysis.CachingTokenFilter;
|
||||||
|
@ -133,9 +132,14 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
"This text has a typo in referring to Keneddy",
|
"This text has a typo in referring to Keneddy",
|
||||||
"wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" };
|
"wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" };
|
||||||
|
|
||||||
public void testQueryScorerHits() throws Exception {
|
// Convenience method for succinct tests; doesn't represent "best practice"
|
||||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
private TokenStream getAnyTokenStream(String fieldName, int docId)
|
||||||
|
throws IOException {
|
||||||
|
return TokenSources.getTokenStream(fieldName, searcher.getIndexReader().getTermVectors(docId),
|
||||||
|
searcher.doc(docId).get(fieldName), analyzer, -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testQueryScorerHits() throws Exception {
|
||||||
PhraseQuery phraseQuery = new PhraseQuery();
|
PhraseQuery phraseQuery = new PhraseQuery();
|
||||||
phraseQuery.add(new Term(FIELD_NAME, "very"));
|
phraseQuery.add(new Term(FIELD_NAME, "very"));
|
||||||
phraseQuery.add(new Term(FIELD_NAME, "long"));
|
phraseQuery.add(new Term(FIELD_NAME, "long"));
|
||||||
|
@ -149,11 +153,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
|
|
||||||
|
|
||||||
for (int i = 0; i < hits.scoreDocs.length; i++) {
|
for (int i = 0; i < hits.scoreDocs.length; i++) {
|
||||||
StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
StoredDocument doc = searcher.doc(docId);
|
||||||
String storedField = doc.get(FIELD_NAME);
|
String storedField = doc.get(FIELD_NAME);
|
||||||
|
|
||||||
TokenStream stream = TokenSources.getAnyTokenStream(searcher
|
TokenStream stream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
.getIndexReader(), hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
|
||||||
|
|
||||||
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
|
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
|
||||||
|
|
||||||
|
@ -177,21 +181,21 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
|
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
|
||||||
Highlighter highlighter = new Highlighter(scorer);
|
Highlighter highlighter = new Highlighter(scorer);
|
||||||
|
|
||||||
StoredDocument doc = searcher.doc(hits.scoreDocs[0].doc);
|
final int docId0 = hits.scoreDocs[0].doc;
|
||||||
|
StoredDocument doc = searcher.doc(docId0);
|
||||||
String storedField = doc.get(FIELD_NAME);
|
String storedField = doc.get(FIELD_NAME);
|
||||||
|
|
||||||
TokenStream stream = TokenSources.getAnyTokenStream(searcher
|
TokenStream stream = getAnyTokenStream(FIELD_NAME, docId0);
|
||||||
.getIndexReader(), hits.scoreDocs[0].doc, FIELD_NAME, doc, analyzer);
|
|
||||||
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
|
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
|
||||||
highlighter.setTextFragmenter(fragmenter);
|
highlighter.setTextFragmenter(fragmenter);
|
||||||
String fragment = highlighter.getBestFragment(stream, storedField);
|
String fragment = highlighter.getBestFragment(stream, storedField);
|
||||||
assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
|
assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
|
||||||
|
|
||||||
doc = searcher.doc(hits.scoreDocs[1].doc);
|
final int docId1 = hits.scoreDocs[1].doc;
|
||||||
|
doc = searcher.doc(docId1);
|
||||||
storedField = doc.get(FIELD_NAME);
|
storedField = doc.get(FIELD_NAME);
|
||||||
|
|
||||||
stream = TokenSources.getAnyTokenStream(searcher
|
stream = getAnyTokenStream(FIELD_NAME, docId1);
|
||||||
.getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
|
|
||||||
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
|
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
|
||||||
fragment = highlighter.getBestFragment(stream, storedField);
|
fragment = highlighter.getBestFragment(stream, storedField);
|
||||||
assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
|
assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
|
||||||
|
@ -231,21 +235,21 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
|
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
|
||||||
Highlighter highlighter = new Highlighter(scorer);
|
Highlighter highlighter = new Highlighter(scorer);
|
||||||
|
|
||||||
StoredDocument doc = searcher.doc(hits.scoreDocs[0].doc);
|
final int docId0 = hits.scoreDocs[0].doc;
|
||||||
|
StoredDocument doc = searcher.doc(docId0);
|
||||||
String storedField = doc.get(FIELD_NAME);
|
String storedField = doc.get(FIELD_NAME);
|
||||||
|
|
||||||
TokenStream stream = TokenSources.getAnyTokenStream(searcher
|
TokenStream stream = getAnyTokenStream(FIELD_NAME, docId0);
|
||||||
.getIndexReader(), hits.scoreDocs[0].doc, FIELD_NAME, doc, analyzer);
|
|
||||||
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
|
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
|
||||||
highlighter.setTextFragmenter(fragmenter);
|
highlighter.setTextFragmenter(fragmenter);
|
||||||
String fragment = highlighter.getBestFragment(stream, storedField);
|
String fragment = highlighter.getBestFragment(stream, storedField);
|
||||||
assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
|
assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
|
||||||
|
|
||||||
doc = searcher.doc(hits.scoreDocs[1].doc);
|
final int docId1 = hits.scoreDocs[1].doc;
|
||||||
|
doc = searcher.doc(docId1);
|
||||||
storedField = doc.get(FIELD_NAME);
|
storedField = doc.get(FIELD_NAME);
|
||||||
|
|
||||||
stream = TokenSources.getAnyTokenStream(searcher
|
stream = getAnyTokenStream(FIELD_NAME, docId1);
|
||||||
.getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
|
|
||||||
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
|
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
|
||||||
fragment = highlighter.getBestFragment(stream, storedField);
|
fragment = highlighter.getBestFragment(stream, storedField);
|
||||||
assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
|
assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
|
||||||
|
@ -392,9 +396,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
Highlighter highlighter = new Highlighter(this, scorer);
|
Highlighter highlighter = new Highlighter(this, scorer);
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||||
|
|
||||||
|
@ -422,9 +427,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
highlighter = new Highlighter(this, scorer);
|
highlighter = new Highlighter(this, scorer);
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||||
|
|
||||||
|
@ -452,9 +458,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
highlighter = new Highlighter(this, scorer);
|
highlighter = new Highlighter(this, scorer);
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||||
|
|
||||||
|
@ -478,9 +485,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
Highlighter highlighter = new Highlighter(this, scorer);
|
Highlighter highlighter = new Highlighter(this, scorer);
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||||
|
|
||||||
|
@ -503,9 +511,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
Highlighter highlighter = new Highlighter(this, scorer);
|
Highlighter highlighter = new Highlighter(this, scorer);
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||||
|
|
||||||
|
@ -528,9 +537,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
Highlighter highlighter = new Highlighter(this, scorer);
|
Highlighter highlighter = new Highlighter(this, scorer);
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||||
|
|
||||||
|
@ -614,8 +624,8 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
String text = "parent document";
|
String text = "parent document";
|
||||||
StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||||
highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
|
highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
|
||||||
|
@ -640,9 +650,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
|
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
|
||||||
"...");
|
"...");
|
||||||
|
@ -663,9 +674,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
int maxNumFragmentsRequired = 2;
|
int maxNumFragmentsRequired = 2;
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
|
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
|
||||||
Highlighter highlighter = new Highlighter(this, scorer);
|
Highlighter highlighter = new Highlighter(this, scorer);
|
||||||
|
|
||||||
|
@ -694,9 +706,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
Highlighter highlighter = new Highlighter(this, scorer);
|
Highlighter highlighter = new Highlighter(this, scorer);
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 5));
|
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 5));
|
||||||
|
|
||||||
|
@ -749,9 +762,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
Highlighter highlighter = new Highlighter(this,scorer);
|
Highlighter highlighter = new Highlighter(this,scorer);
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||||
|
|
||||||
|
@ -821,9 +835,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||||
int maxNumFragmentsRequired = 2;
|
int maxNumFragmentsRequired = 2;
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
|
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
|
||||||
"...");
|
"...");
|
||||||
|
@ -1016,9 +1031,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
hits = searcher.search(query, 1000);
|
hits = searcher.search(query, 1000);
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer); int maxNumFragmentsRequired = 2;
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
int maxNumFragmentsRequired = 2;
|
||||||
String fragmentSeparator = "...";
|
String fragmentSeparator = "...";
|
||||||
QueryScorer scorer = new QueryScorer(query, HighlighterTest.FIELD_NAME);
|
QueryScorer scorer = new QueryScorer(query, HighlighterTest.FIELD_NAME);
|
||||||
|
|
||||||
|
@ -1040,9 +1057,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
numHighlights = 0;
|
numHighlights = 0;
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
int maxNumFragmentsRequired = 2;
|
int maxNumFragmentsRequired = 2;
|
||||||
String fragmentSeparator = "...";
|
String fragmentSeparator = "...";
|
||||||
QueryScorer scorer = new QueryScorer(query, null);
|
QueryScorer scorer = new QueryScorer(query, null);
|
||||||
|
@ -1065,9 +1083,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
numHighlights = 0;
|
numHighlights = 0;
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer); int maxNumFragmentsRequired = 2;
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
int maxNumFragmentsRequired = 2;
|
||||||
String fragmentSeparator = "...";
|
String fragmentSeparator = "...";
|
||||||
QueryScorer scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME);
|
QueryScorer scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME);
|
||||||
|
|
||||||
|
@ -1241,9 +1261,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
|
doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
|
||||||
numHighlights = 0;
|
numHighlights = 0;
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
|
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
|
||||||
HighlighterTest.this);
|
HighlighterTest.this);
|
||||||
|
@ -1256,9 +1277,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
|
|
||||||
numHighlights = 0;
|
numHighlights = 0;
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
|
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
|
||||||
HighlighterTest.this);
|
HighlighterTest.this);
|
||||||
highlighter.getBestFragment(tokenStream, text);
|
highlighter.getBestFragment(tokenStream, text);
|
||||||
|
@ -1268,9 +1290,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
|
|
||||||
numHighlights = 0;
|
numHighlights = 0;
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
|
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
|
||||||
HighlighterTest.this);
|
HighlighterTest.this);
|
||||||
|
@ -1400,9 +1423,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
|
doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
|
|
||||||
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
|
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
|
||||||
HighlighterTest.this);// new Highlighter(this, new
|
HighlighterTest.this);// new Highlighter(this, new
|
||||||
|
@ -1553,9 +1577,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
int maxNumFragmentsRequired = 3;
|
int maxNumFragmentsRequired = 3;
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(FIELD_NAME);
|
String text = doc.get(FIELD_NAME);
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
|
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
|
||||||
Highlighter highlighter = getHighlighter(query, FIELD_NAME, HighlighterTest.this, false);
|
Highlighter highlighter = getHighlighter(query, FIELD_NAME, HighlighterTest.this, false);
|
||||||
|
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
highlighter.setTextFragmenter(new SimpleFragmenter(40));
|
||||||
|
@ -2260,13 +2285,15 @@ final class SynonymTokenizer extends TokenStream {
|
||||||
throws Exception {
|
throws Exception {
|
||||||
|
|
||||||
for (int i = 0; i < hits.totalHits; i++) {
|
for (int i = 0; i < hits.totalHits; i++) {
|
||||||
final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
|
final int docId = hits.scoreDocs[i].doc;
|
||||||
|
final StoredDocument doc = searcher.doc(docId);
|
||||||
String text = doc.get(HighlighterTest.FIELD_NAME);
|
String text = doc.get(HighlighterTest.FIELD_NAME);
|
||||||
int maxNumFragmentsRequired = 2;
|
int maxNumFragmentsRequired = 2;
|
||||||
String fragmentSeparator = "...";
|
String fragmentSeparator = "...";
|
||||||
Scorer scorer = null;
|
Scorer scorer = null;
|
||||||
TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(),
|
TokenStream tokenStream =
|
||||||
hits.scoreDocs[i].doc, HighlighterTest.FIELD_NAME, doc, analyzer);
|
TokenSources.getTokenStream(HighlighterTest.FIELD_NAME,
|
||||||
|
searcher.getIndexReader().getTermVectors(docId), text, analyzer, -1);
|
||||||
if (mode == QUERY) {
|
if (mode == QUERY) {
|
||||||
scorer = new QueryScorer(query);
|
scorer = new QueryScorer(query);
|
||||||
} else if (mode == QUERY_TERM) {
|
} else if (mode == QUERY_TERM) {
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.io.IOException;
|
||||||
import com.carrotsearch.randomizedtesting.annotations.Repeat;
|
import com.carrotsearch.randomizedtesting.annotations.Repeat;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.CannedTokenStream;
|
import org.apache.lucene.analysis.CannedTokenStream;
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
@ -34,6 +35,7 @@ import org.apache.lucene.document.FieldType;
|
||||||
import org.apache.lucene.document.TextField;
|
import org.apache.lucene.document.TextField;
|
||||||
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
|
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
import org.apache.lucene.index.Fields;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.RandomIndexWriter;
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
|
@ -50,6 +52,11 @@ import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.TestUtil;
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
// LUCENE-2874
|
// LUCENE-2874
|
||||||
|
|
||||||
|
/** Tests {@link org.apache.lucene.search.highlight.TokenSources} and
|
||||||
|
* {@link org.apache.lucene.search.highlight.TokenStreamFromTermVector}
|
||||||
|
* indirectly from that.
|
||||||
|
*/
|
||||||
public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
private static final String FIELD = "text";
|
private static final String FIELD = "text";
|
||||||
|
|
||||||
|
@ -100,6 +107,7 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
final Document document = new Document();
|
final Document document = new Document();
|
||||||
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
|
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
customType.setStoreTermVectors(true);
|
customType.setStoreTermVectors(true);
|
||||||
|
// no positions!
|
||||||
customType.setStoreTermVectorOffsets(true);
|
customType.setStoreTermVectorOffsets(true);
|
||||||
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
|
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
|
||||||
indexWriter.addDocument(document);
|
indexWriter.addDocument(document);
|
||||||
|
@ -122,9 +130,8 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
final Highlighter highlighter = new Highlighter(
|
final Highlighter highlighter = new Highlighter(
|
||||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||||
new QueryScorer(query));
|
new QueryScorer(query));
|
||||||
final TokenStream tokenStream = TokenSources
|
final TokenStream tokenStream =
|
||||||
.getTokenStream(
|
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
|
||||||
indexReader.getTermVector(0, FIELD));
|
|
||||||
assertEquals("<B>the fox</B> did not jump",
|
assertEquals("<B>the fox</B> did not jump",
|
||||||
highlighter.getBestFragment(tokenStream, TEXT));
|
highlighter.getBestFragment(tokenStream, TEXT));
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -166,9 +173,8 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
final Highlighter highlighter = new Highlighter(
|
final Highlighter highlighter = new Highlighter(
|
||||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||||
new QueryScorer(query));
|
new QueryScorer(query));
|
||||||
final TokenStream tokenStream = TokenSources
|
final TokenStream tokenStream =
|
||||||
.getTokenStream(
|
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
|
||||||
indexReader.getTermVector(0, FIELD));
|
|
||||||
assertEquals("<B>the fox</B> did not jump",
|
assertEquals("<B>the fox</B> did not jump",
|
||||||
highlighter.getBestFragment(tokenStream, TEXT));
|
highlighter.getBestFragment(tokenStream, TEXT));
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -187,6 +193,7 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
final Document document = new Document();
|
final Document document = new Document();
|
||||||
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
|
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
customType.setStoreTermVectors(true);
|
customType.setStoreTermVectors(true);
|
||||||
|
// no positions!
|
||||||
customType.setStoreTermVectorOffsets(true);
|
customType.setStoreTermVectorOffsets(true);
|
||||||
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
|
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
|
||||||
indexWriter.addDocument(document);
|
indexWriter.addDocument(document);
|
||||||
|
@ -209,9 +216,8 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
final Highlighter highlighter = new Highlighter(
|
final Highlighter highlighter = new Highlighter(
|
||||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||||
new QueryScorer(phraseQuery));
|
new QueryScorer(phraseQuery));
|
||||||
final TokenStream tokenStream = TokenSources
|
final TokenStream tokenStream =
|
||||||
.getTokenStream(
|
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
|
||||||
indexReader.getTermVector(0, FIELD));
|
|
||||||
assertEquals("<B>the fox</B> did not jump",
|
assertEquals("<B>the fox</B> did not jump",
|
||||||
highlighter.getBestFragment(tokenStream, TEXT));
|
highlighter.getBestFragment(tokenStream, TEXT));
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -230,6 +236,7 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
final Document document = new Document();
|
final Document document = new Document();
|
||||||
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
|
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
customType.setStoreTermVectors(true);
|
customType.setStoreTermVectors(true);
|
||||||
|
customType.setStoreTermVectorPositions(true);
|
||||||
customType.setStoreTermVectorOffsets(true);
|
customType.setStoreTermVectorOffsets(true);
|
||||||
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
|
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
|
||||||
indexWriter.addDocument(document);
|
indexWriter.addDocument(document);
|
||||||
|
@ -252,9 +259,8 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
final Highlighter highlighter = new Highlighter(
|
final Highlighter highlighter = new Highlighter(
|
||||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||||
new QueryScorer(phraseQuery));
|
new QueryScorer(phraseQuery));
|
||||||
final TokenStream tokenStream = TokenSources
|
final TokenStream tokenStream =
|
||||||
.getTokenStream(
|
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
|
||||||
indexReader.getTermVector(0, FIELD));
|
|
||||||
assertEquals("<B>the fox</B> did not jump",
|
assertEquals("<B>the fox</B> did not jump",
|
||||||
highlighter.getBestFragment(tokenStream, TEXT));
|
highlighter.getBestFragment(tokenStream, TEXT));
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -263,7 +269,7 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testTermVectorWithoutOffsetsThrowsException()
|
public void testTermVectorWithoutOffsetsDoesntWork()
|
||||||
throws IOException, InvalidTokenOffsetsException {
|
throws IOException, InvalidTokenOffsetsException {
|
||||||
final Directory directory = newDirectory();
|
final Directory directory = newDirectory();
|
||||||
final IndexWriter indexWriter = new IndexWriter(directory,
|
final IndexWriter indexWriter = new IndexWriter(directory,
|
||||||
|
@ -282,12 +288,9 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
final IndexReader indexReader = DirectoryReader.open(directory);
|
final IndexReader indexReader = DirectoryReader.open(directory);
|
||||||
try {
|
try {
|
||||||
assertEquals(1, indexReader.numDocs());
|
assertEquals(1, indexReader.numDocs());
|
||||||
TokenSources.getTokenStream(
|
final TokenStream tokenStream =
|
||||||
indexReader.getTermVector(0, FIELD));
|
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
|
||||||
fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
|
assertNull(tokenStream);
|
||||||
}
|
|
||||||
catch (IllegalArgumentException e) {
|
|
||||||
// expected
|
|
||||||
}
|
}
|
||||||
finally {
|
finally {
|
||||||
indexReader.close();
|
indexReader.close();
|
||||||
|
@ -333,7 +336,7 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
writer.close();
|
writer.close();
|
||||||
assertEquals(1, reader.numDocs());
|
assertEquals(1, reader.numDocs());
|
||||||
|
|
||||||
TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
|
TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
|
||||||
|
|
||||||
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
|
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
|
||||||
PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
|
PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
|
||||||
|
@ -409,7 +412,8 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
writer.close();
|
writer.close();
|
||||||
assertEquals(1, reader.numDocs());
|
assertEquals(1, reader.numDocs());
|
||||||
|
|
||||||
TokenStream vectorTokenStream = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
|
TokenStream vectorTokenStream =
|
||||||
|
TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
|
||||||
|
|
||||||
//sometimes check payloads
|
//sometimes check payloads
|
||||||
PayloadAttribute payloadAttribute = null;
|
PayloadAttribute payloadAttribute = null;
|
||||||
|
@ -428,6 +432,59 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
reader.close();
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMaxStartOffsetConsistency() throws IOException {
|
||||||
|
FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
|
tvFieldType.setStoreTermVectors(true);
|
||||||
|
tvFieldType.setStoreTermVectorOffsets(true);
|
||||||
|
tvFieldType.setStoreTermVectorPositions(true);
|
||||||
|
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
|
||||||
|
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||||
|
analyzer.setEnableChecks(false);//we don't necessarily consume the whole stream because of limiting by startOffset
|
||||||
|
Document doc = new Document();
|
||||||
|
final String TEXT = " f gg h";
|
||||||
|
doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
|
||||||
|
doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));
|
||||||
|
|
||||||
|
IndexReader reader;
|
||||||
|
try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
|
||||||
|
writer.addDocument(doc);
|
||||||
|
reader = writer.getReader();
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
Fields tvFields = reader.getTermVectors(0);
|
||||||
|
for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
|
||||||
|
TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
|
||||||
|
TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);
|
||||||
|
|
||||||
|
//assert have same tokens, none of which has a start offset > maxStartOffset
|
||||||
|
final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
|
||||||
|
final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
|
||||||
|
tvStream.reset();
|
||||||
|
anaStream.reset();
|
||||||
|
while (tvStream.incrementToken()) {
|
||||||
|
assertTrue(anaStream.incrementToken());
|
||||||
|
assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
|
||||||
|
if (maxStartOffset >= 0)
|
||||||
|
assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
|
||||||
|
}
|
||||||
|
assertTrue(anaStream.incrementToken() == false);
|
||||||
|
tvStream.end();
|
||||||
|
anaStream.end();
|
||||||
|
tvStream.close();
|
||||||
|
anaStream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,6 +33,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.index.Fields;
|
||||||
import org.apache.lucene.index.StorableField;
|
import org.apache.lucene.index.StorableField;
|
||||||
import org.apache.lucene.index.StoredDocument;
|
import org.apache.lucene.index.StoredDocument;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
|
@ -493,8 +494,9 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
|
||||||
List<TextFragment> frags = new ArrayList<>();
|
List<TextFragment> frags = new ArrayList<>();
|
||||||
|
|
||||||
//Try term vectors, which is faster
|
//Try term vectors, which is faster
|
||||||
|
final Fields tvFields = searcher.getIndexReader().getTermVectors(docId); // TODO add as param; see SOLR-5855
|
||||||
final TokenStream tvStream =
|
final TokenStream tvStream =
|
||||||
TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
|
TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1);
|
||||||
// We need to wrap in OffsetWindowTokenFilter if multi-valued
|
// We need to wrap in OffsetWindowTokenFilter if multi-valued
|
||||||
final OffsetWindowTokenFilter tvWindowStream;
|
final OffsetWindowTokenFilter tvWindowStream;
|
||||||
if (tvStream != null && fieldValues.size() > 1) {
|
if (tvStream != null && fieldValues.size() > 1) {
|
||||||
|
|
Loading…
Reference in New Issue