Mirror of https://github.com/apache/lucene.git

LUCENE-7526: UnifiedHighlighter: enhance MTQ passage relevancy. TokenStreamFromTermVector isn't used by the UH anymore. Refactored AnalysisOffsetStrategy into TokenStream and MemoryIndex strategies, plus related refactorings.

commit 7af454ad76
parent 280cbfd8fb
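For orientation, the scenario this change targets is highlighting multi-term (wildcard/prefix) queries with the UnifiedHighlighter when offsets come from re-analysis of the stored text. The sketch below is illustrative only and not part of this commit; the class name, field name, and sample text are made up, and it assumes the Lucene 6.x UnifiedHighlighter API (constructor taking an IndexSearcher and an Analyzer, and highlight(field, query, topDocs)).

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.RAMDirectory;

public class WildcardHighlightDemo {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();
    StandardAnalyzer analyzer = new StandardAnalyzer();
    try (IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
      Document doc = new Document();
      // TextField.TYPE_STORED indexes no offsets in postings and no term vectors,
      // so the highlighter resolves to the analysis (re-analysis) offset source
      // that this commit reworks.
      doc.add(new Field("body", "The quick brown fox jumps over the lazy dog", TextField.TYPE_STORED));
      iw.addDocument(doc);
    }
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);
      UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
      Query query = new WildcardQuery(new Term("body", "jump*")); // a multi-term query (MTQ)
      TopDocs topDocs = searcher.search(query, 10);
      String[] fragments = highlighter.highlight("body", query, topDocs);
      System.out.println(fragments[0]); // expect "jumps" wrapped in <b>...</b>
    }
    analyzer.close();
  }
}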
@@ -85,6 +85,11 @@ Improvements

* LUCENE-7524: Added more detailed explanation of how IDF is computed in
  ClassicSimilarity and BM25Similarity. (Adrien Grand)

* LUCENE-7526: Enhanced UnifiedHighlighter's passage relevancy for queries with
  wildcards and sometimes just terms. Added shouldPreferPassageRelevancyOverSpeed()
  which can be overridden to return false to eke out more speed in some cases.
  (Timothy M. Rodriguez, David Smiley)

Other

* LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file
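As a usage note on the new hook named in that CHANGES entry (not part of the patch itself): a subclass can override shouldPreferPassageRelevancyOverSpeed() to trade some MTQ passage relevancy for speed. The exact signature should be checked against the released API; the sketch below assumes the method is protected and takes the field name.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

// Hypothetical subclass; signature of the overridden method is an assumption.
public class SpeedFirstUnifiedHighlighter extends UnifiedHighlighter {

  public SpeedFirstUnifiedHighlighter(IndexSearcher searcher, Analyzer analyzer) {
    super(searcher, analyzer);
  }

  @Override
  protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
    return false; // prefer speed over MTQ passage relevancy, per the CHANGES note
  }
}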
@@ -34,7 +34,7 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
docs.file=temp/enwiki-20070527-pages-articles.xml.bz2

query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
file.query.maker.file=conf/query-phrases.txt
file.query.maker.file=conf/query-terms.txt
log.queries=false
log.step.SearchTravRetHighlight=-1
@@ -55,7 +55,7 @@ highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV

{ "Warm" SearchTravRetHighlight > : 1000

{ "HL" SearchTravRetHighlight > : 500
{ "HL" SearchTravRetHighlight > : 2000

CloseReader
@@ -54,7 +54,7 @@ highlighter=HlImpl:NONE:SH_V:FVH_V:UH_V

{ "Warm" SearchTravRetHighlight > : 1000

{ "HL" SearchTravRetHighlight > : 500
{ "HL" SearchTravRetHighlight > : 2000

CloseReader
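The two hunks above adjust the highlighting benchmark algorithm files (SearchTravRetHighlight tasks and query sources). As a hedged pointer only, not part of the patch: such .alg files are normally run through the benchmark module's Benchmark driver, for example from a small main like the one below. The conf/highlights.alg path is a placeholder for whichever algorithm file you want to run.

// Hypothetical driver invocation; the .alg path is a placeholder.
public class RunHighlightBench {
  public static void main(String[] args) throws Exception {
    org.apache.lucene.benchmark.byTask.Benchmark.main(
        new String[] {"conf/highlights.alg"});
  }
}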
@ -17,181 +17,154 @@
|
|||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.FilteringTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.memory.MemoryIndex;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
|
||||
/**
|
||||
* Uses an {@link Analyzer} on content to get offsets. It may use a {@link MemoryIndex} too.
|
||||
* Provides a base class for analysis based offset strategies to extend from.
|
||||
* Requires an Analyzer and provides an override-able method for altering how
|
||||
* the TokenStream is created.
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
public class AnalysisOffsetStrategy extends FieldOffsetStrategy {
|
||||
public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy {
|
||||
|
||||
//TODO: Consider splitting this highlighter into a MemoryIndexFieldHighlighter and a TokenStreamFieldHighlighter
|
||||
private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0];
|
||||
private final Analyzer analyzer;
|
||||
private final MemoryIndex memoryIndex;
|
||||
private final LeafReader leafReader;
|
||||
private final CharacterRunAutomaton preMemIndexFilterAutomaton;
|
||||
protected final Analyzer analyzer;
|
||||
|
||||
public AnalysisOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
|
||||
CharacterRunAutomaton[] automata, Analyzer analyzer,
|
||||
Function<Query, Collection<Query>> multiTermQueryRewrite) {
|
||||
super(field, extractedTerms, phraseHelper, automata);
|
||||
public AnalysisOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer analyzer) {
|
||||
super(field, queryTerms, phraseHelper, automata);
|
||||
this.analyzer = analyzer;
|
||||
// Automata (Wildcards / MultiTermQuery):
|
||||
this.automata = automata;
|
||||
|
||||
if (terms.length > 0 && !strictPhrases.hasPositionSensitivity()) {
|
||||
this.automata = convertTermsToAutomata(terms, automata);
|
||||
// clear the terms array now that we've moved them to be expressed as automata
|
||||
terms = ZERO_LEN_BYTES_REF_ARRAY;
|
||||
if (analyzer.getOffsetGap(field) != 1) { // note: 1 is the default. It is RARELY changed.
|
||||
throw new IllegalArgumentException(
|
||||
"offset gap of the provided analyzer should be 1 (field " + field + ")");
|
||||
}
|
||||
|
||||
if (terms.length > 0 || strictPhrases.willRewrite()) { //needs MemoryIndex
|
||||
// init MemoryIndex
|
||||
boolean storePayloads = strictPhrases.hasPositionSensitivity(); // might be needed
|
||||
memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
|
||||
leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader();
|
||||
// preFilter for MemoryIndex
|
||||
preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, strictPhrases,
|
||||
multiTermQueryRewrite);
|
||||
} else {
|
||||
memoryIndex = null;
|
||||
leafReader = null;
|
||||
preMemIndexFilterAutomaton = null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public UnifiedHighlighter.OffsetSource getOffsetSource() {
|
||||
public final UnifiedHighlighter.OffsetSource getOffsetSource() {
|
||||
return UnifiedHighlighter.OffsetSource.ANALYSIS;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
|
||||
// note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength
|
||||
TokenStream tokenStream = tokenStream(content);
|
||||
|
||||
if (memoryIndex != null) { // also handles automata.length > 0
|
||||
// We use a MemoryIndex and index the tokenStream so that later we have the PostingsEnum with offsets.
|
||||
|
||||
// note: An *alternative* strategy is to get PostingsEnums without offsets from the main index
|
||||
// and then marry this up with a fake PostingsEnum backed by a TokenStream (which has the offsets) and
|
||||
// can use that to filter applicable tokens? It would have the advantage of being able to exit
|
||||
// early and save some re-analysis. This would be an additional method/offset-source approach
|
||||
// since it's still useful to highlight without any index (so we build MemoryIndex).
|
||||
|
||||
// note: probably unwise to re-use TermsEnum on reset mem index so we don't. But we do re-use the
|
||||
// leaf reader, which is a bit more top level than in the guts.
|
||||
memoryIndex.reset();
|
||||
|
||||
// Filter the tokenStream to applicable terms
|
||||
if (preMemIndexFilterAutomaton != null) {
|
||||
tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
|
||||
}
|
||||
memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close()
|
||||
tokenStream = null; // it's consumed; done.
|
||||
docId = 0;
|
||||
|
||||
if (automata.length > 0) {
|
||||
Terms foundTerms = leafReader.terms(field);
|
||||
if (foundTerms == null) {
|
||||
return Collections.emptyList(); //No offsets for this field.
|
||||
}
|
||||
// Un-invert for the automata. Much more compact than a CachingTokenStream
|
||||
tokenStream = MultiTermHighlighting.uninvertAndFilterTerms(foundTerms, 0, automata, content.length());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return createOffsetsEnums(leafReader, docId, tokenStream);
|
||||
}
|
||||
|
||||
protected TokenStream tokenStream(String content) throws IOException {
|
||||
return MultiValueTokenStream.wrap(field, analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR);
|
||||
}
|
||||
|
||||
private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
|
||||
CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
newAutomata[i] = MultiTermHighlighting.makeStringMatchAutomata(terms[i]);
|
||||
// If there is no splitChar in content then we needn't wrap:
|
||||
int splitCharIdx = content.indexOf(UnifiedHighlighter.MULTIVAL_SEP_CHAR);
|
||||
if (splitCharIdx == -1) {
|
||||
return analyzer.tokenStream(field, content);
|
||||
}
|
||||
// Append existing automata (that which is used for MTQs)
|
||||
System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
|
||||
return newAutomata;
|
||||
|
||||
TokenStream subTokenStream = analyzer.tokenStream(field, content.substring(0, splitCharIdx));
|
||||
|
||||
return new MultiValueTokenStream(subTokenStream, field, analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR, splitCharIdx);
|
||||
}
|
||||
|
||||
private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
|
||||
final CharacterRunAutomaton charRunAutomaton) {
|
||||
// it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
|
||||
return new FilteringTokenFilter(tokenStream) {
|
||||
final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
@Override
|
||||
protected boolean accept() throws IOException {
|
||||
return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Build one {@link CharacterRunAutomaton} matching any term the query might match.
|
||||
* Wraps an {@link Analyzer} and string text that represents multiple values delimited by a specified character. This
|
||||
* exposes a TokenStream that matches what would get indexed considering the
|
||||
* {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link Analyzer#getOffsetGap(String)} is
|
||||
* 1; an exception will be thrown if it isn't.
|
||||
* <br />
|
||||
* It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer but doing so seems like
|
||||
* more work. The underlying components see a Reader not a String -- and the String is easy to
|
||||
* split up without redundant buffering.
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
|
||||
CharacterRunAutomaton[] automata,
|
||||
PhraseHelper strictPhrases,
|
||||
Function<Query, Collection<Query>> multiTermQueryRewrite) {
|
||||
List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
|
||||
if (terms.length > 0) {
|
||||
allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
|
||||
}
|
||||
Collections.addAll(allAutomata, automata);
|
||||
for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
|
||||
Collections.addAll(allAutomata,
|
||||
MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
|
||||
private static final class MultiValueTokenStream extends TokenFilter {
|
||||
|
||||
private final String fieldName;
|
||||
private final Analyzer indexAnalyzer;
|
||||
private final String content;
|
||||
private final char splitChar;
|
||||
|
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private int startValIdx = 0;
|
||||
private int endValIdx;
|
||||
private int remainingPosInc = 0;
|
||||
|
||||
private MultiValueTokenStream(TokenStream subTokenStream, String fieldName, Analyzer indexAnalyzer,
|
||||
String content, char splitChar, int splitCharIdx) {
|
||||
super(subTokenStream); // subTokenStream is already initialized to operate on the first value
|
||||
this.fieldName = fieldName;
|
||||
this.indexAnalyzer = indexAnalyzer;
|
||||
this.content = content;
|
||||
this.splitChar = splitChar;
|
||||
this.endValIdx = splitCharIdx;
|
||||
}
|
||||
|
||||
if (allAutomata.size() == 1) {
|
||||
return allAutomata.get(0);
|
||||
}
|
||||
//TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
|
||||
// could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
|
||||
// by MultiTermHighlighting.
|
||||
|
||||
// Return an aggregate CharacterRunAutomaton of others
|
||||
return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
|
||||
@Override
|
||||
public boolean run(char[] chars, int offset, int length) {
|
||||
for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
|
||||
if (allAutomata.get(i).run(chars, offset, length)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
if (startValIdx != 0) {
|
||||
throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
|
||||
// ... although we could if a need for it arises.
|
||||
}
|
||||
};
|
||||
}
|
||||
super.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (true) {
|
||||
|
||||
if (input.incrementToken()) {
|
||||
// Position tracking:
|
||||
if (remainingPosInc > 0) {//usually true first token of additional values (not first val)
|
||||
posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
|
||||
remainingPosInc = 0;//reset
|
||||
}
|
||||
// Offset tracking:
|
||||
offsetAtt.setOffset(
|
||||
startValIdx + offsetAtt.startOffset(),
|
||||
startValIdx + offsetAtt.endOffset()
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (endValIdx == content.length()) {//no more
|
||||
return false;
|
||||
}
|
||||
|
||||
input.end(); // might adjust position increment
|
||||
remainingPosInc += posIncAtt.getPositionIncrement();
|
||||
input.close();
|
||||
remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);
|
||||
|
||||
// Get new tokenStream based on next segment divided by the splitChar
|
||||
startValIdx = endValIdx + 1;
|
||||
endValIdx = content.indexOf(splitChar, startValIdx);
|
||||
if (endValIdx == -1) {//EOF
|
||||
endValIdx = content.length();
|
||||
}
|
||||
TokenStream tokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
|
||||
if (tokenStream != input) {// (input is defined in TokenFilter set in the constructor)
|
||||
// This is a grand trick we do -- knowing that the analyzer's re-use strategy is going to produce the
|
||||
// very same tokenStream instance and thus have the same AttributeSource as this wrapping TokenStream
|
||||
// since we used it as our input in the constructor.
|
||||
// Were this not the case, we'd have to copy every attribute of interest since we can't alter the
|
||||
// AttributeSource of this wrapping TokenStream post-construction (it's all private/final).
|
||||
// If this is a problem, we could do that instead; maybe with a custom CharTermAttribute that allows
|
||||
// us to easily set the char[] reference without literally copying char by char.
|
||||
throw new IllegalStateException("Require TokenStream re-use. Unsupported re-use strategy?: " +
|
||||
indexAnalyzer.getReuseStrategy());
|
||||
}
|
||||
tokenStream.reset();
|
||||
} // while loop to increment token of this new value
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
super.end();
|
||||
// Offset tracking:
|
||||
offsetAtt.setOffset(
|
||||
startValIdx + offsetAtt.startOffset(),
|
||||
startValIdx + offsetAtt.endOffset());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -0,0 +1,145 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.List;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

/**
 * Provides a view over several underlying PostingsEnums for the iteration of offsets on the current document only.
 * It's not general purpose; the position returned is always -1 and it doesn't iterate the documents.
 */
final class CompositeOffsetsPostingsEnum extends PostingsEnum {

  private final int docId;
  private final int freq;
  private final PriorityQueue<BoundsCheckingPostingsEnum> queue;
  private boolean firstPositionConsumed = false;

  /**
   * This class is used to ensure we don't over iterate the underlying
   * postings enum by keeping track of the position relative to the
   * frequency.
   * Ideally this would've been an implementation of a PostingsEnum
   * but it would have to delegate most methods and it seemed easier
   * to just wrap the tweaked method.
   */
  private static final class BoundsCheckingPostingsEnum {

    private final PostingsEnum postingsEnum;
    private int remainingPositions;

    BoundsCheckingPostingsEnum(PostingsEnum postingsEnum) throws IOException {
      this.postingsEnum = postingsEnum;
      this.remainingPositions = postingsEnum.freq();
      nextPosition();
    }

    /** Advances to the next position and returns true, or returns false if it can't. */
    private boolean nextPosition() throws IOException {
      if (remainingPositions-- > 0) {
        postingsEnum.nextPosition(); // ignore the actual position; we don't care.
        return true;
      } else {
        return false;
      }
    }

  }

  /** The provided {@link PostingsEnum}s must all be positioned to the same document, and must have offsets. */
  CompositeOffsetsPostingsEnum(List<PostingsEnum> postingsEnums) throws IOException {
    queue = new PriorityQueue<BoundsCheckingPostingsEnum>(postingsEnums.size()) {
      @Override
      protected boolean lessThan(BoundsCheckingPostingsEnum a, BoundsCheckingPostingsEnum b) {
        try {
          return a.postingsEnum.startOffset() < b.postingsEnum.startOffset();
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    };

    int freqAdd = 0;
    for (PostingsEnum postingsEnum : postingsEnums) {
      queue.add(new BoundsCheckingPostingsEnum(postingsEnum));
      freqAdd += postingsEnum.freq();
    }
    freq = freqAdd;
    this.docId = queue.top().postingsEnum.docID();
  }

  @Override
  public int freq() throws IOException {
    return freq;
  }

  /** Advances to the next position. Always returns -1; the caller is assumed not to care for the highlighter. */
  @Override
  public int nextPosition() throws IOException {
    if (!firstPositionConsumed) {
      firstPositionConsumed = true;
    } else if (queue.size() == 0) {
      throw new IllegalStateException("nextPosition called too many times");
    } else if (queue.top().nextPosition()) { // advance head
      queue.updateTop(); //the new position may be behind another postingsEnum in the queue
    } else {
      queue.pop(); //this postingsEnum is consumed; get rid of it. Another will take it's place.
    }
    assert queue.size() > 0;
    return -1;
  }

  @Override
  public int startOffset() throws IOException {
    return queue.top().postingsEnum.startOffset();
  }

  @Override
  public int endOffset() throws IOException {
    return queue.top().postingsEnum.endOffset();
  }

  @Override
  public BytesRef getPayload() throws IOException {
    return queue.top().postingsEnum.getPayload();
  }

  @Override
  public int docID() {
    return docId;
  }

  @Override
  public int nextDoc() throws IOException {
    return NO_MORE_DOCS;
  }

  @Override
  public int advance(int target) throws IOException {
    return NO_MORE_DOCS;
  }

  @Override
  public long cost() {
    return 1L; //at most 1 doc is returned
  }
}
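To make the merging idea in CompositeOffsetsPostingsEnum easier to follow, here is a standalone sketch of the same technique: several per-term offset lists, each already sorted by start offset, are merged into one offset-ordered stream via a heap of cursors. This is plain JDK code, not Lucene code, and not part of this commit; names and data are made up.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

public class OffsetMergeSketch {

  private static final class Cursor {
    final int[][] offsets; // each element is {startOffset, endOffset}, sorted by startOffset
    int next = 0;
    Cursor(int[][] offsets) { this.offsets = offsets; }
    int[] peek() { return offsets[next]; }
    boolean advance() { return ++next < offsets.length; }
  }

  public static List<int[]> merge(List<int[][]> perTermOffsets) {
    PriorityQueue<Cursor> queue =
        new PriorityQueue<>(Comparator.comparingInt((Cursor c) -> c.peek()[0]));
    for (int[][] offsets : perTermOffsets) {
      if (offsets.length > 0) {
        queue.add(new Cursor(offsets));
      }
    }
    List<int[]> merged = new ArrayList<>();
    while (!queue.isEmpty()) {
      Cursor top = queue.poll();  // cursor whose head has the smallest start offset
      merged.add(top.peek());
      if (top.advance()) {
        queue.add(top);           // re-insert with its new head offset
      }
    }
    return merged;
  }

  public static void main(String[] args) {
    List<int[][]> input = new ArrayList<>();
    input.add(new int[][] {{0, 3}, {20, 25}});
    input.add(new int[][] {{5, 9}, {12, 15}});
    for (int[] span : merge(input)) {
      System.out.println(span[0] + "-" + span[1]); // 0-3, 5-9, 12-15, 20-25
    }
  }
}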
@ -14,16 +14,14 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
|
@ -31,6 +29,7 @@ import org.apache.lucene.index.Terms;
|
|||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.spans.Spans;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
|
@ -42,14 +41,14 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
|||
public abstract class FieldOffsetStrategy {
|
||||
|
||||
protected final String field;
|
||||
protected BytesRef[] terms; // Query: free-standing terms
|
||||
protected PhraseHelper strictPhrases; // Query: position-sensitive information TODO: rename
|
||||
protected CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
|
||||
protected final PhraseHelper phraseHelper; // Query: position-sensitive information TODO: rename
|
||||
protected final BytesRef[] terms; // Query: free-standing terms
|
||||
protected final CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
|
||||
|
||||
public FieldOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
|
||||
this.field = field;
|
||||
this.terms = queryTerms;
|
||||
this.strictPhrases = phraseHelper;
|
||||
this.phraseHelper = phraseHelper;
|
||||
this.automata = automata;
|
||||
}
|
||||
|
||||
|
@ -65,58 +64,90 @@ public abstract class FieldOffsetStrategy {
|
|||
*/
|
||||
public abstract List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException;
|
||||
|
||||
protected List<OffsetsEnum> createOffsetsEnums(LeafReader leafReader, int doc, TokenStream tokenStream) throws IOException {
|
||||
List<OffsetsEnum> offsetsEnums = createOffsetsEnumsFromReader(leafReader, doc);
|
||||
if (automata.length > 0) {
|
||||
offsetsEnums.add(createOffsetsEnumFromTokenStream(doc, tokenStream));
|
||||
protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
|
||||
final Terms termsIndex = leafReader.terms(field);
|
||||
if (termsIndex == null) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return offsetsEnums;
|
||||
}
|
||||
|
||||
protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader atomicReader, int doc) throws IOException {
|
||||
// For strict positions, get a Map of term to Spans:
|
||||
// note: ScriptPhraseHelper.NONE does the right thing for these method calls
|
||||
final Map<BytesRef, Spans> strictPhrasesTermToSpans =
|
||||
strictPhrases.getTermToSpans(atomicReader, doc);
|
||||
phraseHelper.getTermToSpans(leafReader, doc);
|
||||
// Usually simply wraps terms in a List; but if willRewrite() then can be expanded
|
||||
final List<BytesRef> sourceTerms =
|
||||
strictPhrases.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
|
||||
phraseHelper.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
|
||||
|
||||
final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + 1);
|
||||
final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + automata.length);
|
||||
|
||||
Terms termsIndex = atomicReader == null || sourceTerms.isEmpty() ? null : atomicReader.terms(field);
|
||||
if (termsIndex != null) {
|
||||
// Handle sourceTerms:
|
||||
if (!sourceTerms.isEmpty()) {
|
||||
TermsEnum termsEnum = termsIndex.iterator();//does not return null
|
||||
for (BytesRef term : sourceTerms) {
|
||||
if (!termsEnum.seekExact(term)) {
|
||||
continue; // term not found
|
||||
}
|
||||
PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
|
||||
if (postingsEnum == null) {
|
||||
// no offsets or positions available
|
||||
throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
|
||||
}
|
||||
if (doc != postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
|
||||
continue;
|
||||
}
|
||||
postingsEnum = strictPhrases.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
|
||||
if (postingsEnum == null) {
|
||||
continue;// completely filtered out
|
||||
}
|
||||
if (termsEnum.seekExact(term)) {
|
||||
PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
|
||||
|
||||
offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
|
||||
if (postingsEnum == null) {
|
||||
// no offsets or positions available
|
||||
throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
|
||||
}
|
||||
|
||||
if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
|
||||
postingsEnum = phraseHelper.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
|
||||
if (postingsEnum != null) {
|
||||
offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handle automata
|
||||
if (automata.length > 0) {
|
||||
offsetsEnums.addAll(createAutomataOffsetsFromTerms(termsIndex, doc));
|
||||
}
|
||||
|
||||
return offsetsEnums;
|
||||
}
|
||||
|
||||
protected OffsetsEnum createOffsetsEnumFromTokenStream(int doc, TokenStream tokenStream) throws IOException {
|
||||
// if there are automata (MTQ), we have to initialize the "fake" enum wrapping them.
|
||||
assert tokenStream != null;
|
||||
// TODO Opt: we sometimes evaluate the automata twice when this TS isn't the original; can we avoid?
|
||||
PostingsEnum mtqPostingsEnum = MultiTermHighlighting.getDocsEnum(tokenStream, automata);
|
||||
assert mtqPostingsEnum instanceof Closeable; // FYI we propagate close() later.
|
||||
mtqPostingsEnum.advance(doc);
|
||||
return new OffsetsEnum(null, mtqPostingsEnum);
|
||||
protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
|
||||
List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
|
||||
for (int i = 0; i < automata.length; i++) {
|
||||
automataPostings.add(new ArrayList<>());
|
||||
}
|
||||
|
||||
TermsEnum termsEnum = termsIndex.iterator();
|
||||
BytesRef term;
|
||||
CharsRefBuilder refBuilder = new CharsRefBuilder();
|
||||
while ((term = termsEnum.next()) != null) {
|
||||
for (int i = 0; i < automata.length; i++) {
|
||||
CharacterRunAutomaton automaton = automata[i];
|
||||
refBuilder.copyUTF8Bytes(term);
|
||||
if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
|
||||
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
|
||||
if (doc == postings.advance(doc)) {
|
||||
automataPostings.get(i).add(postings);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length); //will be at most this long
|
||||
for (int i = 0; i < automata.length; i++) {
|
||||
CharacterRunAutomaton automaton = automata[i];
|
||||
List<PostingsEnum> postingsEnums = automataPostings.get(i);
|
||||
int size = postingsEnums.size();
|
||||
if (size > 0) { //only add if we have offsets
|
||||
BytesRef wildcardTerm = new BytesRef(automaton.toString());
|
||||
if (size == 1) { //don't wrap in a composite if there's only one OffsetsEnum
|
||||
offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
|
||||
} else {
|
||||
offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return offsetsEnums;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@@ -0,0 +1,129 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;


/**
 * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
 *
 * @lucene.internal
 */
public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {

  private final MemoryIndex memoryIndex;
  private final LeafReader leafReader;
  private final CharacterRunAutomaton preMemIndexFilterAutomaton;

  public MemoryIndexOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
                                   CharacterRunAutomaton[] automata, Analyzer analyzer,
                                   Function<Query, Collection<Query>> multiTermQueryRewrite) {
    super(field, extractedTerms, phraseHelper, automata, analyzer);
    boolean storePayloads = phraseHelper.hasPositionSensitivity(); // might be needed
    memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
    leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable
    // preFilter for MemoryIndex
    preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, phraseHelper, multiTermQueryRewrite);
  }

  /**
   * Build one {@link CharacterRunAutomaton} matching any term the query might match.
   */
  private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
                                                              CharacterRunAutomaton[] automata,
                                                              PhraseHelper strictPhrases,
                                                              Function<Query, Collection<Query>> multiTermQueryRewrite) {
    List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
    if (terms.length > 0) {
      allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
    }
    Collections.addAll(allAutomata, automata);
    for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
      Collections.addAll(allAutomata,
          MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
    }

    if (allAutomata.size() == 1) {
      return allAutomata.get(0);
    }
    //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
    //  could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
    //  by MultiTermHighlighting.

    // Return an aggregate CharacterRunAutomaton of others
    return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
      @Override
      public boolean run(char[] chars, int offset, int length) {
        for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
          if (allAutomata.get(i).run(chars, offset, length)) {
            return true;
          }
        }
        return false;
      }
    };
  }

  @Override
  public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
    // note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength
    TokenStream tokenStream = tokenStream(content);

    // Filter the tokenStream to applicable terms
    tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
    memoryIndex.reset();
    memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close()
    docId = 0;

    return createOffsetsEnumsFromReader(leafReader, docId);
  }


  private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
                                                        final CharacterRunAutomaton charRunAutomaton) {
    // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
    return new FilteringTokenFilter(tokenStream) {
      final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);

      @Override
      protected boolean accept() throws IOException {
        return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
      }
    };
  }

}
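For context, MemoryIndexOffsetStrategy leans on MemoryIndex's ability to index one analyzed value with offsets and expose it through the normal postings API. The demo below is illustrative only and not part of the patch; the field name and sample text are made up.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.util.BytesRef;

public class MemoryIndexOffsetsDemo {
  public static void main(String[] args) throws Exception {
    MemoryIndex memoryIndex = new MemoryIndex(true, false); // true == store offsets
    StandardAnalyzer analyzer = new StandardAnalyzer();
    memoryIndex.addField("body", "the quick brown fox", analyzer);

    // Same pattern as the strategy: the MemoryIndex reader is a LeafReader over one document.
    LeafReader leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader();
    Terms terms = leafReader.terms("body");
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
      postings.advance(0); // the MemoryIndex holds exactly one document: docId 0
      for (int i = 0; i < postings.freq(); i++) {
        postings.nextPosition();
        System.out.println(term.utf8ToString() + " " + postings.startOffset() + "-" + postings.endOffset());
      }
    }
    memoryIndex.reset(); // reusable for the next document, as the strategy does
    analyzer.close();
  }
}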
@ -16,8 +16,6 @@
|
|||
*/
|
||||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
|
@ -25,15 +23,7 @@ import java.util.Comparator;
|
|||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.index.FilterLeafReader;
|
||||
import org.apache.lucene.index.FilteredTermsEnum;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.AutomatonQuery;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
|
@ -48,9 +38,7 @@ import org.apache.lucene.search.spans.SpanNearQuery;
|
|||
import org.apache.lucene.search.spans.SpanNotQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanPositionCheckQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
|
@ -210,182 +198,4 @@ class MultiTermHighlighting {
|
|||
return list.toArray(new CharacterRunAutomaton[list.size()]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a "fake" DocsAndPositionsEnum over the tokenstream, returning offsets where {@code matchers}
|
||||
* matches tokens.
|
||||
* <p>
|
||||
* This is solely used internally by PostingsHighlighter: <b>DO NOT USE THIS METHOD!</b>
|
||||
*/
|
||||
public static PostingsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException {
|
||||
return new TokenStreamPostingsEnum(ts, matchers);
|
||||
}
|
||||
|
||||
// TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq()
|
||||
// but this would have a performance cost for likely little gain in the user experience, it
|
||||
// would only serve to make this method less bogus.
|
||||
// instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
|
||||
// TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
|
||||
private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
|
||||
TokenStream stream; // becomes null when closed
|
||||
final CharacterRunAutomaton[] matchers;
|
||||
final CharTermAttribute charTermAtt;
|
||||
final OffsetAttribute offsetAtt;
|
||||
|
||||
int currentDoc = -1;
|
||||
int currentMatch = -1;
|
||||
int currentStartOffset = -1;
|
||||
|
||||
int currentEndOffset = -1;
|
||||
|
||||
final BytesRef matchDescriptions[];
|
||||
|
||||
TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
|
||||
this.stream = ts;
|
||||
this.matchers = matchers;
|
||||
matchDescriptions = new BytesRef[matchers.length];
|
||||
charTermAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
ts.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextPosition() throws IOException {
|
||||
if (stream != null) {
|
||||
while (stream.incrementToken()) {
|
||||
for (int i = 0; i < matchers.length; i++) {
|
||||
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
|
||||
currentStartOffset = offsetAtt.startOffset();
|
||||
currentEndOffset = offsetAtt.endOffset();
|
||||
currentMatch = i;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
stream.end();
|
||||
close();
|
||||
}
|
||||
// exhausted
|
||||
currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
|
||||
return Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int freq() throws IOException {
|
||||
return Integer.MAX_VALUE; // lie
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
assert currentStartOffset >= 0;
|
||||
return currentStartOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
assert currentEndOffset >= 0;
|
||||
return currentEndOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
if (matchDescriptions[currentMatch] == null) {
|
||||
matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
|
||||
}
|
||||
return matchDescriptions[currentMatch];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return currentDoc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
return currentDoc = target;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (stream != null) {
|
||||
stream.close();
|
||||
stream = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a TokenStream un-inverted from the provided Terms, but filtered based on the automata. The
|
||||
* Terms must have exactly one doc count (e.g. term vector or MemoryIndex).
|
||||
*/
|
||||
//TODO: Alternatively, produce a list of OffsetsEnums from the Terms that match the automata.
|
||||
public static TokenStream uninvertAndFilterTerms(Terms termsIndex,
|
||||
int doc,
|
||||
final CharacterRunAutomaton[] automata,
|
||||
int offsetLength)
|
||||
throws IOException {
|
||||
assert automata.length > 0;
|
||||
//Note: if automata were plain Automaton (not CharacterRunAutomaton), we might instead use
|
||||
// TermsEnum.intersect(compiledAutomaton). But probably won't help due to O(N) TV impl so whatever.
|
||||
FilterLeafReader.FilterTerms filteredTermsIndex = new FilterLeafReader.FilterTerms(termsIndex) {
|
||||
@Override
|
||||
public TermsEnum iterator() throws IOException {
|
||||
return new FilteredTermsEnum(super.iterator(), false) {//false == no seek
|
||||
CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//reuse only for UTF8->UTF16 call
|
||||
|
||||
@Override
|
||||
protected AcceptStatus accept(BytesRef termBytesRef) throws IOException {
|
||||
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
|
||||
tempCharsRefBuilder.grow(termBytesRef.length);
|
||||
final int charLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
|
||||
for (CharacterRunAutomaton runAutomaton : automata) {
|
||||
if (runAutomaton.run(tempCharsRefBuilder.chars(), 0, charLen)) {
|
||||
return AcceptStatus.YES;
|
||||
}
|
||||
}
|
||||
return AcceptStatus.NO;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() throws IOException {
|
||||
return -1; // unknown
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getSumTotalTermFreq() throws IOException {
|
||||
return -1; // unknown
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getSumDocFreq() throws IOException {
|
||||
return -1; // unknown
|
||||
}
|
||||
};
|
||||
float loadFactor = 1f / 64f;
|
||||
return new TokenStreamFromTermVector(filteredTermsIndex, doc, offsetLength, loadFactor);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a simple automata that matches the specified term.
|
||||
*/
|
||||
public static CharacterRunAutomaton makeStringMatchAutomata(BytesRef term) {
|
||||
String termString = term.utf8ToString();
|
||||
return new CharacterRunAutomaton(Automata.makeString(termString)) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return termString;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,148 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
/**
|
||||
* Wraps an {@link Analyzer} and string text that represents multiple values delimited by a specified character. This
|
||||
* exposes a TokenStream that matches what would get indexed considering the
|
||||
* {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link Analyzer#getOffsetGap(String)} is
|
||||
* 1; an exception will be thrown if it isn't.
|
||||
* <br />
|
||||
* It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer but doing so seems like
|
||||
* more work. The underlying components see a Reader not a String -- and the String is easy to
|
||||
* split up without redundant buffering.
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
final class MultiValueTokenStream extends TokenFilter {
|
||||
|
||||
private final String fieldName;
|
||||
private final Analyzer indexAnalyzer;
|
||||
private final String content;
|
||||
private final char splitChar;
|
||||
|
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private int startValIdx = 0;
|
||||
private int endValIdx;
|
||||
private int remainingPosInc = 0;
|
||||
|
||||
/** note: The caller must remember to close the TokenStream eventually. */
|
||||
static TokenStream wrap(String fieldName, Analyzer indexAnalyzer, String content, char splitChar)
|
||||
throws IOException {
|
||||
if (indexAnalyzer.getOffsetGap(fieldName) != 1) { // note: 1 is the default. It is RARELY changed.
|
||||
throw new IllegalArgumentException(
|
||||
"offset gap of the provided analyzer should be 1 (field " + fieldName + ")");
|
||||
}
|
||||
// If there is no splitChar in content then we needn't wrap:
|
||||
int splitCharIdx = content.indexOf(splitChar);
|
||||
if (splitCharIdx == -1) {
|
||||
return indexAnalyzer.tokenStream(fieldName, content);
|
||||
}
|
||||
|
||||
TokenStream subTokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(0, splitCharIdx));
|
||||
|
||||
return new MultiValueTokenStream(subTokenStream, fieldName, indexAnalyzer, content, splitChar, splitCharIdx);
|
||||
}
|
||||
|
||||
private MultiValueTokenStream(TokenStream subTokenStream, String fieldName, Analyzer indexAnalyzer,
|
||||
String content, char splitChar, int splitCharIdx) {
|
||||
super(subTokenStream); // subTokenStream is already initialized to operate on the first value
|
||||
this.fieldName = fieldName;
|
||||
this.indexAnalyzer = indexAnalyzer;
|
||||
this.content = content;
|
||||
this.splitChar = splitChar;
|
||||
this.endValIdx = splitCharIdx;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
if (startValIdx != 0) {
|
||||
throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
|
||||
// ... although we could if a need for it arises.
|
||||
}
|
||||
super.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (true) {
|
||||
|
||||
if (input.incrementToken()) {
|
||||
// Position tracking:
|
||||
if (remainingPosInc > 0) {//usually true first token of additional values (not first val)
|
||||
posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
|
||||
remainingPosInc = 0;//reset
|
||||
}
|
||||
// Offset tracking:
|
||||
offsetAtt.setOffset(
|
||||
startValIdx + offsetAtt.startOffset(),
|
||||
startValIdx + offsetAtt.endOffset()
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (endValIdx == content.length()) {//no more
|
||||
return false;
|
||||
}
|
||||
|
||||
input.end(); // might adjust position increment
|
||||
remainingPosInc += posIncAtt.getPositionIncrement();
|
||||
input.close();
|
||||
remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);
|
||||
|
||||
// Get new tokenStream based on next segment divided by the splitChar
|
||||
startValIdx = endValIdx + 1;
|
||||
endValIdx = content.indexOf(splitChar, startValIdx);
|
||||
if (endValIdx == -1) {//EOF
|
||||
endValIdx = content.length();
|
||||
}
|
||||
TokenStream tokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
|
||||
if (tokenStream != input) {// (input is defined in TokenFilter set in the constructor)
|
||||
// This is a grand trick we do -- knowing that the analyzer's re-use strategy is going to produce the
|
||||
// very same tokenStream instance and thus have the same AttributeSource as this wrapping TokenStream
|
||||
// since we used it as our input in the constructor.
|
||||
// Were this not the case, we'd have to copy every attribute of interest since we can't alter the
|
||||
// AttributeSource of this wrapping TokenStream post-construction (it's all private/final).
|
||||
// If this is a problem, we could do that instead; maybe with a custom CharTermAttribute that allows
|
||||
// us to easily set the char[] reference without literally copying char by char.
|
||||
throw new IllegalStateException("Require TokenStream re-use. Unsupported re-use strategy?: " +
|
||||
indexAnalyzer.getReuseStrategy());
|
||||
}
|
||||
tokenStream.reset();
|
||||
} // while loop to increment token of this new value
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
super.end();
|
||||
// Offset tracking:
|
||||
offsetAtt.setOffset(
|
||||
startValIdx + offsetAtt.startOffset(),
|
||||
startValIdx + offsetAtt.endOffset());
|
||||
}
|
||||
|
||||
}
|
|
@@ -76,6 +76,7 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
  }

  void nextPosition() throws IOException {
    assert hasMorePositions();
    pos++;
    postingsEnum.nextPosition();
  }
@@ -40,7 +40,7 @@ public final class Passage {
  BytesRef matchTerms[] = new BytesRef[8];
  int numMatches = 0;

  void addMatch(int startOffset, int endOffset, BytesRef term) {
  public void addMatch(int startOffset, int endOffset, BytesRef term) {
    assert startOffset >= this.startOffset && startOffset <= this.endOffset;
    if (numMatches == matchStarts.length) {
      int newLength = ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
@@ -266,7 +266,7 @@ public class PhraseHelper {
  }

  /**
   * Returns terms as a List, but expanded to any terms in strictPhrases' keySet if present. That can only
   * Returns terms as a List, but expanded to any terms in phraseHelper' keySet if present. That can only
   * happen if willRewrite() is true.
   */
  List<BytesRef> expandTermsIfRewrite(BytesRef[] terms, Map<BytesRef, Spans> strictPhrasesTermToSpans) {
@@ -41,7 +41,7 @@ public class PostingsOffsetStrategy extends FieldOffsetStrategy {

  @Override
  public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
    LeafReader leafReader;
    final LeafReader leafReader;
    if (reader instanceof LeafReader) {
      leafReader = (LeafReader) reader;
    } else {
@@ -54,6 +54,7 @@ public class PostingsOffsetStrategy extends FieldOffsetStrategy {
    return createOffsetsEnumsFromReader(leafReader, docId);
  }


  @Override
  public UnifiedHighlighter.OffsetSource getOffsetSource() {
    return UnifiedHighlighter.OffsetSource.POSTINGS;
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;

@@ -58,14 +57,11 @@ public class PostingsWithTermVectorsOffsetStrategy extends FieldOffsetStrategy {
    }
    leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms);

    TokenStream tokenStream = automata.length > 0 ? MultiTermHighlighting
        .uninvertAndFilterTerms(leafReader.terms(field), docId, this.automata, content.length()) : null;

    return createOffsetsEnums(leafReader, docId, tokenStream);
    return createOffsetsEnumsFromReader(leafReader, docId);
  }

  @Override
  public UnifiedHighlighter.OffsetSource getOffsetSource() {
    return UnifiedHighlighter.OffsetSource.POSTINGS;
    return UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS;
  }
}
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;

@@ -51,18 +50,10 @@ public class TermVectorOffsetStrategy extends FieldOffsetStrategy {
      return Collections.emptyList();
    }

    LeafReader leafReader = null;
    if ((terms.length > 0) || strictPhrases.willRewrite()) {
      leafReader = new TermVectorLeafReader(field, tvTerms);
      docId = 0;
    }
    LeafReader leafReader = new TermVectorLeafReader(field, tvTerms);
    docId = 0;

    TokenStream tokenStream = null;
    if (automata.length > 0) {
      tokenStream = MultiTermHighlighting.uninvertAndFilterTerms(tvTerms, 0, automata, content.length());
    }

    return createOffsetsEnums(leafReader, docId, tokenStream);
    return createOffsetsEnumsFromReader(leafReader, docId);
  }

}
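A reminder relevant to the two term-vector offset strategies above (illustrative, not part of the patch): the highlighted field must be indexed with term vectors that store offsets, and positions as well if position-sensitive queries matter. A field set up along these lines works; the field name and text are placeholders.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;

public class TermVectorFieldSetup {
  public static Field makeBodyField(String text) {
    FieldType fieldType = new FieldType(TextField.TYPE_STORED);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorOffsets(true);
    fieldType.setStoreTermVectorPositions(true);
    fieldType.freeze();
    return new Field("body", text, fieldType);
  }

  public static void main(String[] args) {
    Document doc = new Document();
    doc.add(makeBodyField("some text to highlight"));
    System.out.println(doc);
  }
}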
|
@@ -1,395 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;

/**
* TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
* want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
* because you know the term vector has payloads, since the first call to incrementToken() will observe if you asked
* for them and if not then won't get them. This TokenStream supports an efficient {@link #reset()}, so there's
* no need to wrap with a caching impl.
*
* @lucene.internal
*/
final class TokenStreamFromTermVector extends TokenStream {
// note: differs from similar class in the standard highlighter. This one is optimized for sparse cases.

/**
* content length divided by distinct positions; an average of dense text.
*/
private static final double AVG_CHARS_PER_POSITION = 6;

private static final int INSERTION_SORT_THRESHOLD = 16;

private final Terms vector;

private final int filteredDocId;

private final CharTermAttribute termAttribute;

private final PositionIncrementAttribute positionIncrementAttribute;

private final int offsetLength;

private final float loadFactor;

private OffsetAttribute offsetAttribute;//maybe null

private PayloadAttribute payloadAttribute;//maybe null

private CharsRefBuilder termCharsBuilder;//term data here

private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null

private TokenLL firstToken = null; // the head of a linked-list

private TokenLL incrementToken = null;

private boolean initialized = false;//lazy

public TokenStreamFromTermVector(Terms vector, int offsetLength) throws IOException {
this(vector, 0, offsetLength, 1f);
}

/**
* Constructor.
*
* @param vector Terms that contains the data for
* creating the TokenStream. Must have positions and/or offsets.
* @param filteredDocId The docID we will process.
* @param offsetLength Supply the character length of the text being uninverted, or a lower value if you don't want
* to invert text beyond an offset (in so doing this will act as a filter). If you don't
* know the length, pass -1. In conjunction with {@code loadFactor}, it's used to
* determine how many buckets to create during uninversion.
* It's also used to filter out tokens with a start offset exceeding this value.
* @param loadFactor The percent of tokens from the original terms (by position count) that are
* expected to be inverted. If they are filtered (e.g.
* {@link org.apache.lucene.index.FilterLeafReader.FilterTerms})
* then consider using less than 1.0 to avoid wasting space.
* 1.0 means all, 1/64th would suggest 1/64th of all tokens coming from vector.
*/
TokenStreamFromTermVector(Terms vector, int filteredDocId, int offsetLength, float loadFactor) throws IOException {
super();
this.filteredDocId = filteredDocId;
this.offsetLength = offsetLength == Integer.MAX_VALUE ? -1 : offsetLength;
if (loadFactor <= 0f || loadFactor > 1f) {
throw new IllegalArgumentException("loadFactor should be > 0 and <= 1");
}
this.loadFactor = loadFactor;
assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
}
assert vector.hasFreqs();
this.vector = vector;
termAttribute = addAttribute(CharTermAttribute.class);
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
}

public Terms getTermVectorTerms() {
return vector;
}

@Override
public void reset() throws IOException {
incrementToken = null;
super.reset();
}

//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
assert !initialized;
int dpEnumFlags = 0;
if (vector.hasOffsets()) {
offsetAttribute = addAttribute(OffsetAttribute.class);
dpEnumFlags |= PostingsEnum.OFFSETS;
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
payloadAttribute = getAttribute(PayloadAttribute.class);
payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
spareBytesRefBuilder = new BytesRefBuilder();
dpEnumFlags |= PostingsEnum.PAYLOADS;
}

// We put term data here
termCharsBuilder = new CharsRefBuilder();
termCharsBuilder.grow(initTotalTermCharLen());

// Step 1: iterate termsEnum and create a token, placing into a bucketed array (given a load factor)

final TokenLL[] tokenBuckets = initTokenBucketsArray();
final double OFFSET_TO_BUCKET_IDX = loadFactor / AVG_CHARS_PER_POSITION;
final double POSITION_TO_BUCKET_IDX = loadFactor;

final TermsEnum termsEnum = vector.iterator();
BytesRef termBytesRef;
PostingsEnum dpEnum = null;
final CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call

TERM_LOOP:
while ((termBytesRef = termsEnum.next()) != null) {
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
// note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
tempCharsRefBuilder.grow(termBytesRef.length);
final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
final int termCharsOff = termCharsBuilder.length();
termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
int currentDocId = dpEnum.advance(filteredDocId);
if (currentDocId != filteredDocId) {
continue; //Not expected
}
final int freq = dpEnum.freq();
for (int j = 0; j < freq; j++) {
TokenLL token = new TokenLL();
token.position = dpEnum.nextPosition(); // can be -1 if not in the TV
token.termCharsOff = termCharsOff;
token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
// copy offset (if it's there) and compute bucketIdx
int bucketIdx;
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
if (offsetLength >= 0 && token.startOffset > offsetLength) {
continue TERM_LOOP;//filter this token out; exceeds threshold
}
token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
bucketIdx = (int) (token.startOffset * OFFSET_TO_BUCKET_IDX);
} else {
bucketIdx = (int) (token.position * POSITION_TO_BUCKET_IDX);
}
if (bucketIdx >= tokenBuckets.length) {
bucketIdx = tokenBuckets.length - 1;
}

if (payloadAttribute != null) {
final BytesRef payload = dpEnum.getPayload();
token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
}

//Add token to the head of the bucket linked list
token.next = tokenBuckets[bucketIdx];
tokenBuckets[bucketIdx] = token;
}
}

// Step 2: Link all Tokens into a linked-list and sort all tokens at the same position

firstToken = initLinkAndSortTokens(tokenBuckets);

// If the term vector didn't have positions, synthesize them
if (!vector.hasPositions() && firstToken != null) {
TokenLL prevToken = firstToken;
prevToken.position = 0;
for (TokenLL token = prevToken.next; token != null; prevToken = token, token = token.next) {
if (prevToken.startOffset == token.startOffset) {
token.position = prevToken.position;
} else {
token.position = prevToken.position + 1;
}
}
}

initialized = true;
}

private static TokenLL initLinkAndSortTokens(TokenLL[] tokenBuckets) {
TokenLL firstToken = null;
List<TokenLL> scratchTokenArray = new ArrayList<>(); // declare here for re-use. TODO use native array
TokenLL prevToken = null;
for (TokenLL tokenHead : tokenBuckets) {
if (tokenHead == null) {
continue;
}
//sort tokens at this position and link them; return the first
TokenLL tokenTail;
// just one token
if (tokenHead.next == null) {
tokenTail = tokenHead;
} else {
// add the linked list to a temporary array
for (TokenLL cur = tokenHead; cur != null; cur = cur.next) {
scratchTokenArray.add(cur);
}
// sort; and set tokenHead & tokenTail
if (scratchTokenArray.size() < INSERTION_SORT_THRESHOLD) {
// insertion sort by creating a linked list (leave scratchTokenArray alone)
tokenHead = tokenTail = scratchTokenArray.get(0);
tokenHead.next = null;
for (int i = 1; i < scratchTokenArray.size(); i++) {
TokenLL insertToken = scratchTokenArray.get(i);
if (insertToken.compareTo(tokenHead) <= 0) {
// takes the place of tokenHead
insertToken.next = tokenHead;
tokenHead = insertToken;
} else {
// goes somewhere after tokenHead
for (TokenLL prev = tokenHead; true; prev = prev.next) {
if (prev.next == null || insertToken.compareTo(prev.next) <= 0) {
if (prev.next == null) {
tokenTail = insertToken;
}
insertToken.next = prev.next;
prev.next = insertToken;
break;
}
}
}
}
} else {
Collections.sort(scratchTokenArray);
// take back out and create a linked list
TokenLL prev = tokenHead = scratchTokenArray.get(0);
for (int i = 1; i < scratchTokenArray.size(); i++) {
prev.next = scratchTokenArray.get(i);
prev = prev.next;
}
tokenTail = prev;
tokenTail.next = null;
}
scratchTokenArray.clear();//too bad ArrayList nulls it out; we don't actually need that
}

//link to previous
if (prevToken != null) {
assert prevToken.next == null;
prevToken.next = tokenHead; //concatenate linked-list
assert prevToken.compareTo(tokenHead) < 0 : "wrong offset / position ordering expectations";
} else {
assert firstToken == null;
firstToken = tokenHead;
}

prevToken = tokenTail;
}
return firstToken;
}

private int initTotalTermCharLen() throws IOException {
int guessNumTerms;
if (vector.size() != -1) {
guessNumTerms = (int) vector.size();
} else if (offsetLength != -1) {
guessNumTerms = (int) (offsetLength * 0.33);//guess 1/3rd
} else {
return 128;
}
return Math.max(64, (int) (guessNumTerms * loadFactor * 7.0));//7 is over-estimate of average term len
}

private TokenLL[] initTokenBucketsArray() throws IOException {
// Estimate the number of non-empty positions (number of tokens, excluding same-position synonyms).
int positionsEstimate;
if (offsetLength == -1) { // no clue what the char length is.
// Estimate the number of position slots we need from term stats based on Wikipedia.
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
int size = (int) vector.size();
if (size == -1) {//doesn't happen with term vectors, it seems, but pick a default any way
size = 128;
}
sumTotalTermFreq = (int) (size * 2.4);
}
positionsEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
} else {
// guess number of token positions by this factor.
positionsEstimate = (int) (offsetLength / AVG_CHARS_PER_POSITION);
}
// apply the load factor.
return new TokenLL[Math.max(1, (int) (positionsEstimate * loadFactor))];
}

@Override
public boolean incrementToken() throws IOException {
int posInc;
if (incrementToken == null) {
if (!initialized) {
init();
assert initialized;
}
incrementToken = firstToken;
if (incrementToken == null) {
return false;
}
posInc = incrementToken.position + 1;//first token normally has pos 0; add 1 to get posInc
} else if (incrementToken.next != null) {
int lastPosition = incrementToken.position;
incrementToken = incrementToken.next;
posInc = incrementToken.position - lastPosition;
} else {
return false;
}
clearAttributes();
termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);

positionIncrementAttribute.setPositionIncrement(posInc);
if (offsetAttribute != null) {
offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
}
if (payloadAttribute != null && incrementToken.payloadIndex >= 0) {
payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
}
return true;
}

private static class TokenLL implements Comparable<TokenLL> {
// This class should weigh 32 bytes, including object header

int termCharsOff; // see termCharsBuilder
short termCharsLen;

int position;
int startOffset;
short endOffsetInc; // add to startOffset to get endOffset
int payloadIndex;

TokenLL next;

@Override
public int compareTo(TokenLL tokenB) {
int cmp = Integer.compare(this.position, tokenB.position);
if (cmp == 0) {
cmp = Integer.compare(this.startOffset, tokenB.startOffset);
if (cmp == 0) {
cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
}
}
return cmp;
}
}
}
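Aside: the constructor javadoc and init() of the deleted class above describe how tokens are bucketed by character offset using offsetLength and loadFactor. A minimal standalone sketch of just that arithmetic, with the constant copied from the class and a made-up startOffset and loadFactor (not values from the patch):

public class BucketMathDemo {
  public static void main(String[] args) {
    double AVG_CHARS_PER_POSITION = 6;   // constant from TokenStreamFromTermVector
    float loadFactor = 1f;               // 1.0 = expect to keep every term-vector token
    double OFFSET_TO_BUCKET_IDX = loadFactor / AVG_CHARS_PER_POSITION;
    int startOffset = 100;               // hypothetical token start offset
    // roughly startOffset / 6 characters per position; prints 16 here
    System.out.println((int) (startOffset * OFFSET_TO_BUCKET_IDX));
  }
}

A smaller loadFactor shrinks the bucket array proportionally, which is the point of the parameter when most term-vector tokens are expected to be filtered out.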
@@ -0,0 +1,173 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;

import java.io.Closeable;
import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

/**
* Analyzes the text, producing a single {@link OffsetsEnum} wrapping the {@link TokenStream} filtered to terms
* in the query, including wildcards. It can't handle position-sensitive queries (phrases). Passage accuracy suffers
* because the freq() is unknown -- it's always {@link Integer#MAX_VALUE} instead.
*/
public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {

private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0];

public TokenStreamOffsetStrategy(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer indexAnalyzer) {
super(field, ZERO_LEN_BYTES_REF_ARRAY, phraseHelper, convertTermsToAutomata(terms, automata), indexAnalyzer);
assert phraseHelper.hasPositionSensitivity() == false;
}

private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
for (int i = 0; i < terms.length; i++) {
String termString = terms[i].utf8ToString();
newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) {
@Override
public String toString() {
return termString;
}
};
}
// Append existing automata (that which is used for MTQs)
System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
return newAutomata;
}

@Override
public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
TokenStream tokenStream = tokenStream(content);
PostingsEnum mtqPostingsEnum = new TokenStreamPostingsEnum(tokenStream, automata);
mtqPostingsEnum.advance(docId);
return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
}

// but this would have a performance cost for likely little gain in the user experience, it
// would only serve to make this method less bogus.
// instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
// TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
TokenStream stream; // becomes null when closed
final CharacterRunAutomaton[] matchers;
final CharTermAttribute charTermAtt;
final OffsetAttribute offsetAtt;

int currentDoc = -1;
int currentMatch = -1;
int currentStartOffset = -1;

int currentEndOffset = -1;

final BytesRef matchDescriptions[];

TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
this.stream = ts;
this.matchers = matchers;
matchDescriptions = new BytesRef[matchers.length];
charTermAtt = ts.addAttribute(CharTermAttribute.class);
offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
}

@Override
public int nextPosition() throws IOException {
if (stream != null) {
while (stream.incrementToken()) {
for (int i = 0; i < matchers.length; i++) {
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
currentStartOffset = offsetAtt.startOffset();
currentEndOffset = offsetAtt.endOffset();
currentMatch = i;
return 0;
}
}
}
stream.end();
close();
}
// exhausted
currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
return Integer.MAX_VALUE;
}

@Override
public int freq() throws IOException {
return Integer.MAX_VALUE; // lie
}

@Override
public int startOffset() throws IOException {
assert currentStartOffset >= 0;
return currentStartOffset;
}

@Override
public int endOffset() throws IOException {
assert currentEndOffset >= 0;
return currentEndOffset;
}

@Override
public BytesRef getPayload() throws IOException {
if (matchDescriptions[currentMatch] == null) {
matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
}
return matchDescriptions[currentMatch];
}

@Override
public int docID() {
return currentDoc;
}

@Override
public int nextDoc() throws IOException {
throw new UnsupportedOperationException();
}

@Override
public int advance(int target) throws IOException {
return currentDoc = target;
}

@Override
public long cost() {
return 0;
}

@Override
public void close() throws IOException {
if (stream != null) {
stream.close();
stream = null;
}
}
}
}
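For readers following the new strategy above: its core trick is turning each plain query term into a CharacterRunAutomaton (convertTermsToAutomata) and then keeping only the analyzed tokens that some automaton accepts (TokenStreamPostingsEnum.nextPosition). A self-contained sketch of that matching loop outside the highlighter; the StandardAnalyzer (from the analyzers-common module), field name, sample text, and query terms here are illustrative assumptions, not part of the patch:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

public class TermAutomatonMatchDemo {
  public static void main(String[] args) throws IOException {
    // One automaton per query term, like convertTermsToAutomata(...) does.
    String[] queryTerms = {"alpha", "bravo"};
    CharacterRunAutomaton[] matchers = new CharacterRunAutomaton[queryTerms.length];
    for (int i = 0; i < queryTerms.length; i++) {
      matchers[i] = new CharacterRunAutomaton(Automata.makeString(queryTerms[i]));
    }

    Analyzer analyzer = new StandardAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("body", "Alpha Bravo Bravado foo")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
      // Mirrors TokenStreamPostingsEnum.nextPosition(): keep only tokens some automaton accepts.
      while (ts.incrementToken()) {
        for (CharacterRunAutomaton matcher : matchers) {
          if (matcher.run(termAtt.buffer(), 0, termAtt.length())) {
            System.out.println(termAtt + " @ [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
            break;
          }
        }
      }
      ts.end();
    }
    analyzer.close();
  }
}

In the real strategy the automata for wildcard and other multi-term queries come from MultiTermHighlighting.extractAutomata(...), and matched offsets are surfaced through the PostingsEnum API rather than printed.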
@@ -117,6 +117,8 @@ public class UnifiedHighlighter {

private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging"

private boolean defaultPassageRelevancyOverSpeed = true; //For analysis, prefer MemoryIndexOffsetStrategy

// private boolean defaultRequireFieldMatch = true; TODO

private int maxLength = DEFAULT_MAX_LENGTH;

@@ -213,6 +215,12 @@ public class UnifiedHighlighter {
return defaultHighlightPhrasesStrictly;
}

protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
return defaultPassageRelevancyOverSpeed;
}

/**
* The maximum content size to process. Content will be truncated to this size before highlighting. Typically
* snippets closer to the beginning of the document better summarize its content.

@@ -716,8 +724,13 @@ public class UnifiedHighlighter {
}

protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
BytesRef[] terms = filterExtractedTerms(field, allTerms);
Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
return new FieldHighlighter(field,
getOffsetStrategy(field, query, allTerms),
getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags),
new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
getScorer(field),
maxPassages,

@@ -725,41 +738,7 @@ public class UnifiedHighlighter {
getFormatter(field));
}

protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, SortedSet<Term> allTerms) {
EnumSet<HighlightFlag> highlightFlags = getFlags(field);
BytesRef[] terms = filterExtractedTerms(field, allTerms);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
switch (offsetSource) {
case ANALYSIS:
return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
this::preMultiTermQueryRewrite);
case NONE_NEEDED:
return NoOpOffsetStrategy.INSTANCE;
case TERM_VECTORS:
return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata);
case POSTINGS:
return new PostingsOffsetStrategy(field, terms, phraseHelper, automata);
case POSTINGS_WITH_TERM_VECTORS:
return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata);
default:
throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
}
}

protected EnumSet<HighlightFlag> getFlags(String field) {
EnumSet<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
if (shouldHandleMultiTermQuery(field)) {
highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
}
if (shouldHighlightPhrasesStrictly(field)) {
highlightFlags.add(HighlightFlag.PHRASES);
}
return highlightFlags;
}

protected BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
protected static BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
// TODO consider requireFieldMatch
Term floor = new Term(field, "");
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);

@@ -774,7 +753,21 @@ public class UnifiedHighlighter {
return terms;
}

protected PhraseHelper getPhraseHelper(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
protected Set<HighlightFlag> getFlags(String field) {
Set<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
if (shouldHandleMultiTermQuery(field)) {
highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
}
if (shouldHighlightPhrasesStrictly(field)) {
highlightFlags.add(HighlightFlag.PHRASES);
}
if (shouldPreferPassageRelevancyOverSpeed(field)) {
highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
}
return highlightFlags;
}

protected PhraseHelper getPhraseHelper(String field, Query query, Set<HighlightFlag> highlightFlags) {
boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES);
boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
return highlightPhrasesStrictly ?

@@ -782,7 +775,7 @@ public class UnifiedHighlighter {
PhraseHelper.NONE;
}

protected CharacterRunAutomaton[] getAutomata(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES),
this::preMultiTermQueryRewrite)

@@ -790,11 +783,12 @@ public class UnifiedHighlighter {
}

protected OffsetSource getOptimizedOffsetSource(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
OffsetSource offsetSource = getOffsetSource(field);

if (terms.length == 0 && automata.length == 0 && !phraseHelper.willRewrite()) {
return OffsetSource.NONE_NEEDED; //nothing to highlight
}

OffsetSource offsetSource = getOffsetSource(field);
switch (offsetSource) {
case POSTINGS:
if (phraseHelper.willRewrite()) {

@@ -822,6 +816,32 @@ public class UnifiedHighlighter {
return offsetSource;
}

protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms,
PhraseHelper phraseHelper, CharacterRunAutomaton[] automata,
Set<HighlightFlag> highlightFlags) {
switch (offsetSource) {
case ANALYSIS:
if (!phraseHelper.hasPositionSensitivity() &&
!highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)) {
//skip using a memory index since it's pure term filtering
return new TokenStreamOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer());
} else {
return new MemoryIndexOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
this::preMultiTermQueryRewrite);
}
case NONE_NEEDED:
return NoOpOffsetStrategy.INSTANCE;
case TERM_VECTORS:
return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata);
case POSTINGS:
return new PostingsOffsetStrategy(field, terms, phraseHelper, automata);
case POSTINGS_WITH_TERM_VECTORS:
return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata);
default:
throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
}
}

/**
* When highlighting phrases accurately, we need to know which {@link SpanQuery}'s need to have
* {@link Query#rewrite(IndexReader)} called on them. It helps performance to avoid it if it's not needed.

@@ -1041,10 +1061,9 @@ public class UnifiedHighlighter {
*/
public enum HighlightFlag {
PHRASES,
MULTI_TERM_QUERY
MULTI_TERM_QUERY,
PASSAGE_RELEVANCY_OVER_SPEED
// TODO: ignoreQueryFields
// TODO: useQueryBoosts
// TODO: avoidMemoryIndexIfPossible
// TODO: preferMemoryIndexForStats
}
}
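The PASSAGE_RELEVANCY_OVER_SPEED plumbing above means a subclass can trade passage relevancy for speed on the ANALYSIS (re-analysis) offset source. A minimal sketch, assuming an existing IndexSearcher `searcher`, the index-time Analyzer `indexAnalyzer`, a `query`, and `topDocs`:

// Sketch only: prefer speed over passage relevancy when re-analyzing content.
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) {
  @Override
  protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
    return false; // drops HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED for every field
  }
};
String[] snippets = highlighter.highlight("body", query, topDocs, 2);

When the override returns false and the query has no position-sensitive (phrase) component, getOffsetStrategy(...) selects TokenStreamOffsetStrategy and skips building a MemoryIndex; otherwise MemoryIndexOffsetStrategy is still used.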
@@ -773,7 +773,40 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
ir.close();
}

public void testTokenStreamIsClosed() throws IOException {
public void testWithMaxLenAndMultipleWildcardMatches() throws IOException {
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);

Field body = new Field("body", "", fieldType);
Document doc = new Document();
doc.add(body);

//tests interleaving of multiple wildcard matches with the CompositePostingsEnum
//In this case the CompositePostingsEnum will have an underlying PostingsEnum that jumps from pos 1 to 9 for bravo
//and a second with position 2 for Bravado
body.setStringValue("Alpha Bravo Bravado foo foo foo. Foo foo Alpha Bravo");
iw.addDocument(doc);

IndexReader ir = iw.getReader();
iw.close();

IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setMaxLength(32);//a little past first sentence

BooleanQuery query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("body", "alpha")), BooleanClause.Occur.MUST)
.add(new PrefixQuery(new Term("body", "bra")), BooleanClause.Occur.MUST)
.build();
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
String snippets[] = highlighter.highlight("body", query, topDocs, 2);//ask for 2 but we'll only get 1
assertArrayEquals(
new String[]{"<b>Alpha</b> <b>Bravo</b> <b>Bravado</b> foo foo foo."}, snippets
);

ir.close();
}

public void testTokenStreamIsClosed() throws Exception {
// note: test is a derivative of testWithMaxLen()
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);

@@ -828,8 +861,8 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
if (fieldType == UHTestHelper.reanalysisType) {
fail("Expecting EXPECTED IOException");
}
} catch (IOException e) {
if (!e.getMessage().equals("EXPECTED")) {
} catch (Exception e) {
if (!e.getMessage().contains("EXPECTED")) {
throw e;
}
}
@@ -50,9 +50,8 @@ public class TestUnifiedHighlighterRanking extends LuceneTestCase {

Analyzer indexAnalyzer;

// note: don't choose reanalysis because it doesn't always know the term frequency, which is a statistic used
// in passage ranking. Sometimes it does (e.g. when it builds a MemoryIndex) but not necessarily.
final FieldType fieldType = UHTestHelper.randomFieldType(random(), UHTestHelper.postingsType, UHTestHelper.tvType);
// note: all offset sources, by default, use term freq, so it shouldn't matter which we choose.
final FieldType fieldType = UHTestHelper.randomFieldType(random());

/**
* indexes a bunch of gibberish, and then highlights top(n).
@@ -22,11 +22,13 @@ import java.text.BreakIterator;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;

@@ -68,6 +70,11 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
return Collections.emptyList();
}

@Override
protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
return super.createOffsetsEnumsFromReader(leafReader, doc);
}

};
assertEquals(offsetSource, strategy.getOffsetSource());
}

@@ -142,8 +149,8 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
}

@Override
protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, SortedSet<Term> allTerms) {
return super.getOffsetStrategy(field, query, allTerms);
protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Set<HighlightFlag> highlightFlags) {
return super.getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
}

@Override