mirror of https://github.com/apache/lucene.git
SOLR-6680: refactor DefaultSolrHighlighter.TermOffsetsTokenStream (from term vectors) to avoid buffering the token.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1647481 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e0dbdd9260
commit
75bc69dcce
|
@ -356,8 +356,9 @@ Optimizations
|
|||
compare-and-set writes. This change also adds batching for consecutive messages
|
||||
belonging to the same collection with stateFormat=2. (shalin)
|
||||
|
||||
* SOLR-6680: DefaultSolrHighlighter can sometimes avoid CachingTokenFilter to save memory and
|
||||
enable other optimizations. (David Smiley)
|
||||
* SOLR-6680: DefaultSolrHighlighter can sometimes avoid CachingTokenFilter with
|
||||
hl.usePhraseHighlighter, and can be more efficient handling data from term vectors.
|
||||
(David Smiley)
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
|
|
@ -16,16 +16,42 @@
|
|||
*/
|
||||
package org.apache.solr.highlight;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.CachingTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.index.StorableField;
|
||||
import org.apache.lucene.index.StoredDocument;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.highlight.*;
|
||||
import org.apache.lucene.search.highlight.Encoder;
|
||||
import org.apache.lucene.search.highlight.Formatter;
|
||||
import org.apache.lucene.search.vectorhighlight.*;
|
||||
import org.apache.lucene.search.highlight.Fragmenter;
|
||||
import org.apache.lucene.search.highlight.Highlighter;
|
||||
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
|
||||
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;
|
||||
import org.apache.lucene.search.highlight.QueryScorer;
|
||||
import org.apache.lucene.search.highlight.QueryTermScorer;
|
||||
import org.apache.lucene.search.highlight.Scorer;
|
||||
import org.apache.lucene.search.highlight.TextFragment;
|
||||
import org.apache.lucene.search.highlight.TokenSources;
|
||||
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
|
||||
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
|
||||
import org.apache.lucene.search.vectorhighlight.FieldQuery;
|
||||
import org.apache.lucene.search.vectorhighlight.FragListBuilder;
|
||||
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
|
||||
import org.apache.lucene.util.AttributeSource.State;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.params.HighlightParams;
|
||||
|
@ -45,9 +71,6 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
*
|
||||
* @since solr 1.3
|
||||
|
@ -460,11 +483,15 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
|
|||
String[] summaries = null;
|
||||
List<TextFragment> frags = new ArrayList<>();
|
||||
|
||||
TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization (multi-valued)
|
||||
//Try term vectors, which is faster
|
||||
TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
|
||||
final OffsetWindowTokenFilter tvWindowStream;
|
||||
if (tvStream != null && schemaField.multiValued() && isActuallyMultiValued(allFields, fieldName)) {
|
||||
tots = new TermOffsetsTokenStream(tvStream);
|
||||
tvWindowStream = new OffsetWindowTokenFilter(tvStream);
|
||||
} else {
|
||||
tvWindowStream = null;
|
||||
}
|
||||
|
||||
int mvToExamine = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
|
||||
Integer.toString(Integer.MAX_VALUE)));
|
||||
int mvToMatch = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_MATCH,
|
||||
|
@ -478,10 +505,9 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
|
|||
--mvToExamine;
|
||||
String thisText = thisField.stringValue();
|
||||
TokenStream tstream;
|
||||
if (tots != null) {
|
||||
// if we're using TermOffsets optimization (multi-valued field with term vectors), then get the next
|
||||
// field value's TokenStream (i.e. get field j's TokenStream) from tots:
|
||||
tstream = tots.getMultiValuedTokenStream(thisText.length());
|
||||
if (tvWindowStream != null) {
|
||||
// if we have a multi-valued field with term vectors, then get the next offset window
|
||||
tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
|
||||
} else if (tvStream != null) {
|
||||
tstream = tvStream; // single-valued with term vectors
|
||||
} else {
|
||||
|
@ -684,6 +710,13 @@ final class TokenOrderingFilter extends TokenFilter {
|
|||
this.windowSize = windowSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
queue.clear();
|
||||
done = false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (!done && queue.size() < windowSize) {
|
||||
|
@ -726,76 +759,69 @@ class OrderedToken {
|
|||
int startOffset;
|
||||
}
|
||||
|
||||
class TermOffsetsTokenStream {
|
||||
/** For use with term vectors of multi-valued fields. We want an offset based window into it's TokenStream. */
|
||||
final class OffsetWindowTokenFilter extends TokenFilter {
|
||||
|
||||
final TokenStream bufferedTokenStream;
|
||||
final OffsetAttribute bufferedOffsetAtt;
|
||||
State bufferedToken;
|
||||
int bufferedStartOffset;
|
||||
int bufferedEndOffset;
|
||||
int startOffset = 0;
|
||||
int endOffset;
|
||||
boolean bufferedTokenStreamWasReset = false;
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private int windowStartOffset;
|
||||
private int windowEndOffset = -1;//exclusive
|
||||
private boolean windowTokenIncremented = false;
|
||||
private boolean inputWasReset = false;
|
||||
private State capturedState;//only used for first token of each subsequent window
|
||||
|
||||
public TermOffsetsTokenStream( TokenStream tstream ){
|
||||
bufferedTokenStream = tstream;
|
||||
bufferedOffsetAtt = bufferedTokenStream.addAttribute(OffsetAttribute.class);
|
||||
OffsetWindowTokenFilter(TokenStream input) {//input should not have been reset already
|
||||
super(input);
|
||||
}
|
||||
|
||||
public TokenStream getMultiValuedTokenStream( final int length ){
|
||||
endOffset = startOffset + length;
|
||||
return new MultiValuedStream(length);
|
||||
//Called at the start of each value/window
|
||||
OffsetWindowTokenFilter advanceToNextWindowOfLength(int length) {
|
||||
windowStartOffset = windowEndOffset + 1;//unclear why there's a single offset gap between values, but tests show it
|
||||
windowEndOffset = windowStartOffset + length;
|
||||
windowTokenIncremented = false;//thereby permit reset()
|
||||
return this;
|
||||
}
|
||||
|
||||
final class MultiValuedStream extends TokenStream {
|
||||
private final int length;
|
||||
private boolean incrementTokenWasCalled = false;
|
||||
OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
MultiValuedStream(int length) {
|
||||
super(bufferedTokenStream.cloneAttributes());//clone so we don't manipulate the buffered offsets
|
||||
this.length = length;
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
//we do some state checking to ensure this is being used correctly
|
||||
if (windowTokenIncremented) {
|
||||
throw new IllegalStateException("This TokenStream does not support being subsequently reset()");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
//this flag allows reset() to be called multiple times up-front without a problem
|
||||
if (incrementTokenWasCalled) {
|
||||
throw new IllegalStateException("This TokenStream does not support being subsequently reset()");
|
||||
}
|
||||
if (!bufferedTokenStreamWasReset) {
|
||||
bufferedTokenStream.reset();
|
||||
bufferedTokenStreamWasReset = true;
|
||||
}
|
||||
if (!inputWasReset) {
|
||||
super.reset();
|
||||
inputWasReset = true;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
incrementTokenWasCalled = true;
|
||||
while( true ){
|
||||
if( bufferedToken == null ) {
|
||||
if (!bufferedTokenStream.incrementToken())
|
||||
return false;
|
||||
bufferedToken = bufferedTokenStream.captureState();
|
||||
bufferedStartOffset = bufferedOffsetAtt.startOffset();
|
||||
bufferedEndOffset = bufferedOffsetAtt.endOffset();
|
||||
}
|
||||
|
||||
if( startOffset <= bufferedStartOffset &&
|
||||
bufferedEndOffset <= endOffset ){
|
||||
restoreState(bufferedToken);
|
||||
bufferedToken = null;
|
||||
offsetAtt.setOffset( offsetAtt.startOffset() - startOffset, offsetAtt.endOffset() - startOffset );
|
||||
return true;
|
||||
}
|
||||
else if( bufferedEndOffset > endOffset ){
|
||||
startOffset += length + 1;
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
assert inputWasReset;
|
||||
windowTokenIncremented = true;
|
||||
while (true) {
|
||||
//increment Token
|
||||
if (capturedState == null) {
|
||||
if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
bufferedToken = null;
|
||||
} else {
|
||||
restoreState(capturedState);
|
||||
capturedState = null;
|
||||
//Set posInc to 1 on first token of subsequent windows. To be thorough, we could subtract posIncGap?
|
||||
posIncAtt.setPositionIncrement(1);
|
||||
}
|
||||
}
|
||||
|
||||
final int startOffset = offsetAtt.startOffset();
|
||||
final int endOffset = offsetAtt.endOffset();
|
||||
if (startOffset >= windowEndOffset) {//end of window
|
||||
capturedState = captureState();
|
||||
return false;
|
||||
}
|
||||
if (startOffset >= windowStartOffset) {//in this window
|
||||
offsetAtt.setOffset(startOffset - windowStartOffset, endOffset - windowStartOffset);
|
||||
return true;
|
||||
}
|
||||
//otherwise this token is before the window; continue to advance
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,10 @@
|
|||
|
||||
package org.apache.solr.highlight;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
||||
|
@ -29,10 +33,6 @@ import org.junit.After;
|
|||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Tests some basic functionality of Solr while demonstrating good
|
||||
* Best Practices for using AbstractSolrTestCase
|
||||
|
@ -170,16 +170,15 @@ public class HighlighterTest extends SolrTestCaseJ4 {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testTermOffsetsTokenStream() throws Exception {
|
||||
public void testOffsetWindowTokenFilter() throws Exception {
|
||||
String[] multivalued = { "a b c d", "e f g", "h", "i j k l m n" };
|
||||
Analyzer a1 = new WhitespaceAnalyzer();
|
||||
TokenStream tokenStream = a1.tokenStream("", "a b c d e f g h i j k l m n");
|
||||
tokenStream.reset();
|
||||
|
||||
TermOffsetsTokenStream tots = new TermOffsetsTokenStream(
|
||||
tokenStream);
|
||||
OffsetWindowTokenFilter tots = new OffsetWindowTokenFilter(tokenStream);
|
||||
for( String v : multivalued ){
|
||||
TokenStream ts1 = tots.getMultiValuedTokenStream( v.length() );
|
||||
TokenStream ts1 = tots.advanceToNextWindowOfLength(v.length());
|
||||
ts1.reset();
|
||||
Analyzer a2 = new WhitespaceAnalyzer();
|
||||
TokenStream ts2 = a2.tokenStream("", v);
|
||||
ts2.reset();
|
||||
|
|
Loading…
Reference in New Issue