LUCENE-7526: UnifiedHighlighter: enhance MTQ passage relevancy. TokenStreamFromTermVector isn't used by the UH anymore. Refactor AnalysisOffsetStrategy into TokenStream and MemoryIndex strategies, and related refactorings from that.

David Smiley 2016-11-15 16:16:46 -05:00
parent 280cbfd8fb
commit 7af454ad76
21 changed files with 767 additions and 997 deletions

View File

@ -85,6 +85,11 @@ Improvements
* LUCENE-7524: Added more detailed explanation of how IDF is computed in
ClassicSimilarity and BM25Similarity. (Adrien Grand)
* LUCENE-7526: Enhanced UnifiedHighlighter's passage relevancy for queries with
wildcards and sometimes just terms. Added shouldPreferPassageRelevancyOverSpeed()
which can be overridden to return false to eke out more speed in some cases
(a short override sketch follows this hunk).
(Timothy M. Rodriguez, David Smiley)
Other
* LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file
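
A short sketch of how the new hook above could be used. This is illustrative only: the subclass name is made up, and the assumption that shouldPreferPassageRelevancyOverSpeed() takes the field name should be verified against UnifiedHighlighter in this commit.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

/** Sketch: trade some MTQ passage relevancy for speed (hypothetical subclass). */
class SpeedPreferringHighlighter extends UnifiedHighlighter {
  SpeedPreferringHighlighter(IndexSearcher searcher, Analyzer analyzer) {
    super(searcher, analyzer);
  }

  @Override
  protected boolean shouldPreferPassageRelevancyOverSpeed(String field) { // assumed signature
    return false; // the default is assumed to prefer relevancy; false opts back into speed
  }
}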

View File

@ -34,7 +34,7 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
file.query.maker.file=conf/query-phrases.txt
file.query.maker.file=conf/query-terms.txt
log.queries=false
log.step.SearchTravRetHighlight=-1
@ -55,7 +55,7 @@ highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
{ "Warm" SearchTravRetHighlight > : 1000
{ "HL" SearchTravRetHighlight > : 500
{ "HL" SearchTravRetHighlight > : 2000
CloseReader

View File

@ -54,7 +54,7 @@ highlighter=HlImpl:NONE:SH_V:FVH_V:UH_V
{ "Warm" SearchTravRetHighlight > : 1000
{ "HL" SearchTravRetHighlight > : 500
{ "HL" SearchTravRetHighlight > : 2000
CloseReader

View File

@ -17,181 +17,154 @@
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* Uses an {@link Analyzer} on content to get offsets. It may use a {@link MemoryIndex} too.
* Provides a base class for analysis based offset strategies to extend from.
* Requires an Analyzer and provides an override-able method for altering how
* the TokenStream is created.
*
* @lucene.internal
*/
public class AnalysisOffsetStrategy extends FieldOffsetStrategy {
public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy {
//TODO: Consider splitting this highlighter into a MemoryIndexFieldHighlighter and a TokenStreamFieldHighlighter
private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0];
private final Analyzer analyzer;
private final MemoryIndex memoryIndex;
private final LeafReader leafReader;
private final CharacterRunAutomaton preMemIndexFilterAutomaton;
protected final Analyzer analyzer;
public AnalysisOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
CharacterRunAutomaton[] automata, Analyzer analyzer,
Function<Query, Collection<Query>> multiTermQueryRewrite) {
super(field, extractedTerms, phraseHelper, automata);
public AnalysisOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer analyzer) {
super(field, queryTerms, phraseHelper, automata);
this.analyzer = analyzer;
// Automata (Wildcards / MultiTermQuery):
this.automata = automata;
if (terms.length > 0 && !strictPhrases.hasPositionSensitivity()) {
this.automata = convertTermsToAutomata(terms, automata);
// clear the terms array now that we've moved them to be expressed as automata
terms = ZERO_LEN_BYTES_REF_ARRAY;
if (analyzer.getOffsetGap(field) != 1) { // note: 1 is the default. It is RARELY changed.
throw new IllegalArgumentException(
"offset gap of the provided analyzer should be 1 (field " + field + ")");
}
if (terms.length > 0 || strictPhrases.willRewrite()) { //needs MemoryIndex
// init MemoryIndex
boolean storePayloads = strictPhrases.hasPositionSensitivity(); // might be needed
memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader();
// preFilter for MemoryIndex
preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, strictPhrases,
multiTermQueryRewrite);
} else {
memoryIndex = null;
leafReader = null;
preMemIndexFilterAutomaton = null;
}
}
@Override
public UnifiedHighlighter.OffsetSource getOffsetSource() {
public final UnifiedHighlighter.OffsetSource getOffsetSource() {
return UnifiedHighlighter.OffsetSource.ANALYSIS;
}
@Override
public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
// note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength
TokenStream tokenStream = tokenStream(content);
if (memoryIndex != null) { // also handles automata.length > 0
// We use a MemoryIndex and index the tokenStream so that later we have the PostingsEnum with offsets.
// note: An *alternative* strategy is to get PostingsEnums without offsets from the main index
// and then marry this up with a fake PostingsEnum backed by a TokenStream (which has the offsets) and
// can use that to filter applicable tokens? It would have the advantage of being able to exit
// early and save some re-analysis. This would be an additional method/offset-source approach
// since it's still useful to highlight without any index (so we build MemoryIndex).
// note: probably unwise to re-use TermsEnum on reset mem index so we don't. But we do re-use the
// leaf reader, which is a bit more top level than in the guts.
memoryIndex.reset();
// Filter the tokenStream to applicable terms
if (preMemIndexFilterAutomaton != null) {
tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
}
memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close()
tokenStream = null; // it's consumed; done.
docId = 0;
if (automata.length > 0) {
Terms foundTerms = leafReader.terms(field);
if (foundTerms == null) {
return Collections.emptyList(); //No offsets for this field.
}
// Un-invert for the automata. Much more compact than a CachingTokenStream
tokenStream = MultiTermHighlighting.uninvertAndFilterTerms(foundTerms, 0, automata, content.length());
}
}
return createOffsetsEnums(leafReader, docId, tokenStream);
}
protected TokenStream tokenStream(String content) throws IOException {
return MultiValueTokenStream.wrap(field, analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR);
}
private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
for (int i = 0; i < terms.length; i++) {
newAutomata[i] = MultiTermHighlighting.makeStringMatchAutomata(terms[i]);
// If there is no splitChar in content then we needn't wrap:
int splitCharIdx = content.indexOf(UnifiedHighlighter.MULTIVAL_SEP_CHAR);
if (splitCharIdx == -1) {
return analyzer.tokenStream(field, content);
}
// Append existing automata (that which is used for MTQs)
System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
return newAutomata;
TokenStream subTokenStream = analyzer.tokenStream(field, content.substring(0, splitCharIdx));
return new MultiValueTokenStream(subTokenStream, field, analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR, splitCharIdx);
}
private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
final CharacterRunAutomaton charRunAutomaton) {
// it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
return new FilteringTokenFilter(tokenStream) {
final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
@Override
protected boolean accept() throws IOException {
return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
}
};
}
/**
* Build one {@link CharacterRunAutomaton} matching any term the query might match.
* Wraps an {@link Analyzer} and string text that represents multiple values delimited by a specified character. This
* exposes a TokenStream that matches what would get indexed considering the
* {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link Analyzer#getOffsetGap(String)} is
* 1; an exception will be thrown if it isn't.
* <br />
* It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer but doing so seems like
* more work. The underlying components see a Reader not a String -- and the String is easy to
* split up without redundant buffering.
*
* @lucene.internal
*/
private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
CharacterRunAutomaton[] automata,
PhraseHelper strictPhrases,
Function<Query, Collection<Query>> multiTermQueryRewrite) {
List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
if (terms.length > 0) {
allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
}
Collections.addAll(allAutomata, automata);
for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
Collections.addAll(allAutomata,
MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
private static final class MultiValueTokenStream extends TokenFilter {
private final String fieldName;
private final Analyzer indexAnalyzer;
private final String content;
private final char splitChar;
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private int startValIdx = 0;
private int endValIdx;
private int remainingPosInc = 0;
private MultiValueTokenStream(TokenStream subTokenStream, String fieldName, Analyzer indexAnalyzer,
String content, char splitChar, int splitCharIdx) {
super(subTokenStream); // subTokenStream is already initialized to operate on the first value
this.fieldName = fieldName;
this.indexAnalyzer = indexAnalyzer;
this.content = content;
this.splitChar = splitChar;
this.endValIdx = splitCharIdx;
}
if (allAutomata.size() == 1) {
return allAutomata.get(0);
}
//TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
// could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
// by MultiTermHighlighting.
// Return an aggregate CharacterRunAutomaton of others
return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
@Override
public boolean run(char[] chars, int offset, int length) {
for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
if (allAutomata.get(i).run(chars, offset, length)) {
return true;
}
}
return false;
@Override
public void reset() throws IOException {
if (startValIdx != 0) {
throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
// ... although we could if a need for it arises.
}
};
}
super.reset();
}
@Override
public boolean incrementToken() throws IOException {
while (true) {
if (input.incrementToken()) {
// Position tracking:
if (remainingPosInc > 0) {//usually true first token of additional values (not first val)
posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
remainingPosInc = 0;//reset
}
// Offset tracking:
offsetAtt.setOffset(
startValIdx + offsetAtt.startOffset(),
startValIdx + offsetAtt.endOffset()
);
return true;
}
if (endValIdx == content.length()) {//no more
return false;
}
input.end(); // might adjust position increment
remainingPosInc += posIncAtt.getPositionIncrement();
input.close();
remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);
// Get new tokenStream based on next segment divided by the splitChar
startValIdx = endValIdx + 1;
endValIdx = content.indexOf(splitChar, startValIdx);
if (endValIdx == -1) {//EOF
endValIdx = content.length();
}
TokenStream tokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
if (tokenStream != input) {// (input is defined in TokenFilter set in the constructor)
// This is a grand trick we do -- knowing that the analyzer's re-use strategy is going to produce the
// very same tokenStream instance and thus have the same AttributeSource as this wrapping TokenStream
// since we used it as our input in the constructor.
// Were this not the case, we'd have to copy every attribute of interest since we can't alter the
// AttributeSource of this wrapping TokenStream post-construction (it's all private/final).
// If this is a problem, we could do that instead; maybe with a custom CharTermAttribute that allows
// us to easily set the char[] reference without literally copying char by char.
throw new IllegalStateException("Require TokenStream re-use. Unsupported re-use strategy?: " +
indexAnalyzer.getReuseStrategy());
}
tokenStream.reset();
} // while loop to increment token of this new value
}
@Override
public void end() throws IOException {
super.end();
// Offset tracking:
offsetAtt.setOffset(
startValIdx + offsetAtt.startOffset(),
startValIdx + offsetAtt.endOffset());
}
}
}

View File

@ -0,0 +1,145 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
/**
* Provides a view over several underlying PostingsEnums for the iteration of offsets on the current document only.
* It's not general purpose; the position returned is always -1 and it doesn't iterate the documents.
*/
final class CompositeOffsetsPostingsEnum extends PostingsEnum {
private final int docId;
private final int freq;
private final PriorityQueue<BoundsCheckingPostingsEnum> queue;
private boolean firstPositionConsumed = false;
/**
* This class is used to ensure we don't over iterate the underlying
* postings enum by keeping track of the position relative to the
* frequency.
* Ideally this would've been an implementation of a PostingsEnum
* but it would have to delegate most methods and it seemed easier
* to just wrap the tweaked method.
*/
private static final class BoundsCheckingPostingsEnum {
private final PostingsEnum postingsEnum;
private int remainingPositions;
BoundsCheckingPostingsEnum(PostingsEnum postingsEnum) throws IOException {
this.postingsEnum = postingsEnum;
this.remainingPositions = postingsEnum.freq();
nextPosition();
}
/** Advances to the next position and returns true, or returns false if it can't. */
private boolean nextPosition() throws IOException {
if (remainingPositions-- > 0) {
postingsEnum.nextPosition(); // ignore the actual position; we don't care.
return true;
} else {
return false;
}
}
}
/** The provided {@link PostingsEnum}s must all be positioned to the same document, and must have offsets. */
CompositeOffsetsPostingsEnum(List<PostingsEnum> postingsEnums) throws IOException {
queue = new PriorityQueue<BoundsCheckingPostingsEnum>(postingsEnums.size()) {
@Override
protected boolean lessThan(BoundsCheckingPostingsEnum a, BoundsCheckingPostingsEnum b) {
try {
return a.postingsEnum.startOffset() < b.postingsEnum.startOffset();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
};
int freqAdd = 0;
for (PostingsEnum postingsEnum : postingsEnums) {
queue.add(new BoundsCheckingPostingsEnum(postingsEnum));
freqAdd += postingsEnum.freq();
}
freq = freqAdd;
this.docId = queue.top().postingsEnum.docID();
}
@Override
public int freq() throws IOException {
return freq;
}
/** Advances to the next position. Always returns -1; the caller is assumed not to care for the highlighter. */
@Override
public int nextPosition() throws IOException {
if (!firstPositionConsumed) {
firstPositionConsumed = true;
} else if (queue.size() == 0) {
throw new IllegalStateException("nextPosition called too many times");
} else if (queue.top().nextPosition()) { // advance head
queue.updateTop(); //the new position may be behind another postingsEnum in the queue
} else {
queue.pop(); //this postingsEnum is consumed; get rid of it. Another will take its place.
}
assert queue.size() > 0;
return -1;
}
@Override
public int startOffset() throws IOException {
return queue.top().postingsEnum.startOffset();
}
@Override
public int endOffset() throws IOException {
return queue.top().postingsEnum.endOffset();
}
@Override
public BytesRef getPayload() throws IOException {
return queue.top().postingsEnum.getPayload();
}
@Override
public int docID() {
return docId;
}
@Override
public int nextDoc() throws IOException {
return NO_MORE_DOCS;
}
@Override
public int advance(int target) throws IOException {
return NO_MORE_DOCS;
}
@Override
public long cost() {
return 1L; //at most 1 doc is returned
}
}
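
For orientation, the class above is consumed from within the same package, roughly as follows: gather one offsets-capable PostingsEnum per term that a wildcard automaton matched on the document being highlighted, then merge them into a single offsets stream. The helper class and method names below are ours, and this is only a condensed sketch of what FieldOffsetStrategy.createAutomataOffsetsFromTerms does later in this commit.

package org.apache.lucene.search.uhighlight; // same package; the composite is package-private

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

/** Sketch: one merged offsets view over every term an automaton matches in one doc. */
class AutomatonOffsetsSketch {
  static PostingsEnum offsetsForAutomaton(Terms terms, CharacterRunAutomaton automaton, int doc)
      throws IOException {
    List<PostingsEnum> matched = new ArrayList<>();
    TermsEnum termsEnum = terms.iterator();
    CharsRefBuilder chars = new CharsRefBuilder();
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
      chars.copyUTF8Bytes(term);
      if (automaton.run(chars.chars(), 0, chars.length())) {
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
        if (postings.advance(doc) == doc) { // keep only enums positioned on this doc
          matched.add(postings);
        }
      }
    }
    if (matched.isEmpty()) {
      return null; // the automaton matched nothing in this document
    }
    // One enum needs no wrapper; otherwise interleave them by start offset.
    return matched.size() == 1 ? matched.get(0) : new CompositeOffsetsPostingsEnum(matched);
  }
}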

View File

@ -14,16 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
@ -31,6 +29,7 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
@ -42,14 +41,14 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
public abstract class FieldOffsetStrategy {
protected final String field;
protected BytesRef[] terms; // Query: free-standing terms
protected PhraseHelper strictPhrases; // Query: position-sensitive information TODO: rename
protected CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
protected final PhraseHelper phraseHelper; // Query: position-sensitive information TODO: rename
protected final BytesRef[] terms; // Query: free-standing terms
protected final CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
public FieldOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
this.field = field;
this.terms = queryTerms;
this.strictPhrases = phraseHelper;
this.phraseHelper = phraseHelper;
this.automata = automata;
}
@ -65,58 +64,90 @@ public abstract class FieldOffsetStrategy {
*/
public abstract List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException;
protected List<OffsetsEnum> createOffsetsEnums(LeafReader leafReader, int doc, TokenStream tokenStream) throws IOException {
List<OffsetsEnum> offsetsEnums = createOffsetsEnumsFromReader(leafReader, doc);
if (automata.length > 0) {
offsetsEnums.add(createOffsetsEnumFromTokenStream(doc, tokenStream));
protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
final Terms termsIndex = leafReader.terms(field);
if (termsIndex == null) {
return Collections.emptyList();
}
return offsetsEnums;
}
protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader atomicReader, int doc) throws IOException {
// For strict positions, get a Map of term to Spans:
// note: ScriptPhraseHelper.NONE does the right thing for these method calls
final Map<BytesRef, Spans> strictPhrasesTermToSpans =
strictPhrases.getTermToSpans(atomicReader, doc);
phraseHelper.getTermToSpans(leafReader, doc);
// Usually simply wraps terms in a List; but if willRewrite() then can be expanded
final List<BytesRef> sourceTerms =
strictPhrases.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
phraseHelper.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + 1);
final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + automata.length);
Terms termsIndex = atomicReader == null || sourceTerms.isEmpty() ? null : atomicReader.terms(field);
if (termsIndex != null) {
// Handle sourceTerms:
if (!sourceTerms.isEmpty()) {
TermsEnum termsEnum = termsIndex.iterator();//does not return null
for (BytesRef term : sourceTerms) {
if (!termsEnum.seekExact(term)) {
continue; // term not found
}
PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
if (postingsEnum == null) {
// no offsets or positions available
throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
}
if (doc != postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
continue;
}
postingsEnum = strictPhrases.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
if (postingsEnum == null) {
continue;// completely filtered out
}
if (termsEnum.seekExact(term)) {
PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
if (postingsEnum == null) {
// no offsets or positions available
throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
}
if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
postingsEnum = phraseHelper.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
if (postingsEnum != null) {
offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
}
}
}
}
}
// Handle automata
if (automata.length > 0) {
offsetsEnums.addAll(createAutomataOffsetsFromTerms(termsIndex, doc));
}
return offsetsEnums;
}
protected OffsetsEnum createOffsetsEnumFromTokenStream(int doc, TokenStream tokenStream) throws IOException {
// if there are automata (MTQ), we have to initialize the "fake" enum wrapping them.
assert tokenStream != null;
// TODO Opt: we sometimes evaluate the automata twice when this TS isn't the original; can we avoid?
PostingsEnum mtqPostingsEnum = MultiTermHighlighting.getDocsEnum(tokenStream, automata);
assert mtqPostingsEnum instanceof Closeable; // FYI we propagate close() later.
mtqPostingsEnum.advance(doc);
return new OffsetsEnum(null, mtqPostingsEnum);
protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
for (int i = 0; i < automata.length; i++) {
automataPostings.add(new ArrayList<>());
}
TermsEnum termsEnum = termsIndex.iterator();
BytesRef term;
CharsRefBuilder refBuilder = new CharsRefBuilder();
while ((term = termsEnum.next()) != null) {
for (int i = 0; i < automata.length; i++) {
CharacterRunAutomaton automaton = automata[i];
refBuilder.copyUTF8Bytes(term);
if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
if (doc == postings.advance(doc)) {
automataPostings.get(i).add(postings);
}
}
}
}
List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length); //will be at most this long
for (int i = 0; i < automata.length; i++) {
CharacterRunAutomaton automaton = automata[i];
List<PostingsEnum> postingsEnums = automataPostings.get(i);
int size = postingsEnums.size();
if (size > 0) { //only add if we have offsets
BytesRef wildcardTerm = new BytesRef(automaton.toString());
if (size == 1) { //don't wrap in a composite if there's only one OffsetsEnum
offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
} else {
offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
}
}
}
return offsetsEnums;
}
}

View File

@ -0,0 +1,129 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
*
* @lucene.internal
*/
public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
private final MemoryIndex memoryIndex;
private final LeafReader leafReader;
private final CharacterRunAutomaton preMemIndexFilterAutomaton;
public MemoryIndexOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
CharacterRunAutomaton[] automata, Analyzer analyzer,
Function<Query, Collection<Query>> multiTermQueryRewrite) {
super(field, extractedTerms, phraseHelper, automata, analyzer);
boolean storePayloads = phraseHelper.hasPositionSensitivity(); // might be needed
memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable
// preFilter for MemoryIndex
preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, phraseHelper, multiTermQueryRewrite);
}
/**
* Build one {@link CharacterRunAutomaton} matching any term the query might match.
*/
private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
CharacterRunAutomaton[] automata,
PhraseHelper strictPhrases,
Function<Query, Collection<Query>> multiTermQueryRewrite) {
List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
if (terms.length > 0) {
allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
}
Collections.addAll(allAutomata, automata);
for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
Collections.addAll(allAutomata,
MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
}
if (allAutomata.size() == 1) {
return allAutomata.get(0);
}
//TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
// could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
// by MultiTermHighlighting.
// Return an aggregate CharacterRunAutomaton of others
return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
@Override
public boolean run(char[] chars, int offset, int length) {
for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
if (allAutomata.get(i).run(chars, offset, length)) {
return true;
}
}
return false;
}
};
}
@Override
public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
// note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength
TokenStream tokenStream = tokenStream(content);
// Filter the tokenStream to applicable terms
tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
memoryIndex.reset();
memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close()
docId = 0;
return createOffsetsEnumsFromReader(leafReader, docId);
}
private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
final CharacterRunAutomaton charRunAutomaton) {
// it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
return new FilteringTokenFilter(tokenStream) {
final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
@Override
protected boolean accept() throws IOException {
return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
}
};
}
}
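
The MemoryIndex round-trip above is easier to picture in isolation: analyze the content into a one-document, offsets-storing in-memory index, then read postings with offsets back from its LeafReader. A minimal standalone sketch of that idea follows; the class name, field name, and sample text are illustrative and not part of the commit.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.util.BytesRef;

/** Sketch: a MemoryIndex used purely as an offsets source for a single document. */
public class MemoryIndexOffsetsSketch {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    MemoryIndex memoryIndex = new MemoryIndex(true); // true == store offsets
    memoryIndex.addField("body", "the quick brown fox jumped", analyzer);

    LeafReader leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader();
    TermsEnum termsEnum = leafReader.terms("body").iterator();
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
      PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
      postings.advance(0); // the MemoryIndex holds exactly one document: docId 0
      for (int i = 0; i < postings.freq(); i++) {
        postings.nextPosition();
        System.out.println(term.utf8ToString() + " " + postings.startOffset() + "-" + postings.endOffset());
      }
    }
    analyzer.close();
  }
}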

View File

@ -16,8 +16,6 @@
*/
package org.apache.lucene.search.uhighlight;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@ -25,15 +23,7 @@ import java.util.Comparator;
import java.util.List;
import java.util.function.Function;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@ -48,9 +38,7 @@ import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanPositionCheckQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
@ -210,182 +198,4 @@ class MultiTermHighlighting {
return list.toArray(new CharacterRunAutomaton[list.size()]);
}
/**
* Returns a "fake" DocsAndPositionsEnum over the tokenstream, returning offsets where {@code matchers}
* matches tokens.
* <p>
* This is solely used internally by PostingsHighlighter: <b>DO NOT USE THIS METHOD!</b>
*/
public static PostingsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException {
return new TokenStreamPostingsEnum(ts, matchers);
}
// TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq()
// but this would have a performance cost for likely little gain in the user experience, it
// would only serve to make this method less bogus.
// instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
// TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
TokenStream stream; // becomes null when closed
final CharacterRunAutomaton[] matchers;
final CharTermAttribute charTermAtt;
final OffsetAttribute offsetAtt;
int currentDoc = -1;
int currentMatch = -1;
int currentStartOffset = -1;
int currentEndOffset = -1;
final BytesRef matchDescriptions[];
TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
this.stream = ts;
this.matchers = matchers;
matchDescriptions = new BytesRef[matchers.length];
charTermAtt = ts.addAttribute(CharTermAttribute.class);
offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
}
@Override
public int nextPosition() throws IOException {
if (stream != null) {
while (stream.incrementToken()) {
for (int i = 0; i < matchers.length; i++) {
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
currentStartOffset = offsetAtt.startOffset();
currentEndOffset = offsetAtt.endOffset();
currentMatch = i;
return 0;
}
}
}
stream.end();
close();
}
// exhausted
currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
return Integer.MAX_VALUE;
}
@Override
public int freq() throws IOException {
return Integer.MAX_VALUE; // lie
}
@Override
public int startOffset() throws IOException {
assert currentStartOffset >= 0;
return currentStartOffset;
}
@Override
public int endOffset() throws IOException {
assert currentEndOffset >= 0;
return currentEndOffset;
}
@Override
public BytesRef getPayload() throws IOException {
if (matchDescriptions[currentMatch] == null) {
matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
}
return matchDescriptions[currentMatch];
}
@Override
public int docID() {
return currentDoc;
}
@Override
public int nextDoc() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int advance(int target) throws IOException {
return currentDoc = target;
}
@Override
public long cost() {
return 0;
}
@Override
public void close() throws IOException {
if (stream != null) {
stream.close();
stream = null;
}
}
}
/**
* Return a TokenStream un-inverted from the provided Terms, but filtered based on the automata. The
* Terms must have exactly one doc count (e.g. term vector or MemoryIndex).
*/
//TODO: Alternatively, produce a list of OffsetsEnums from the Terms that match the automata.
public static TokenStream uninvertAndFilterTerms(Terms termsIndex,
int doc,
final CharacterRunAutomaton[] automata,
int offsetLength)
throws IOException {
assert automata.length > 0;
//Note: if automata were plain Automaton (not CharacterRunAutomaton), we might instead use
// TermsEnum.intersect(compiledAutomaton). But probably won't help due to O(N) TV impl so whatever.
FilterLeafReader.FilterTerms filteredTermsIndex = new FilterLeafReader.FilterTerms(termsIndex) {
@Override
public TermsEnum iterator() throws IOException {
return new FilteredTermsEnum(super.iterator(), false) {//false == no seek
CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//reuse only for UTF8->UTF16 call
@Override
protected AcceptStatus accept(BytesRef termBytesRef) throws IOException {
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
tempCharsRefBuilder.grow(termBytesRef.length);
final int charLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
for (CharacterRunAutomaton runAutomaton : automata) {
if (runAutomaton.run(tempCharsRefBuilder.chars(), 0, charLen)) {
return AcceptStatus.YES;
}
}
return AcceptStatus.NO;
}
};
}
@Override
public long size() throws IOException {
return -1; // unknown
}
@Override
public long getSumTotalTermFreq() throws IOException {
return -1; // unknown
}
@Override
public long getSumDocFreq() throws IOException {
return -1; // unknown
}
};
float loadFactor = 1f / 64f;
return new TokenStreamFromTermVector(filteredTermsIndex, doc, offsetLength, loadFactor);
}
/**
* Returns a simple automata that matches the specified term.
*/
public static CharacterRunAutomaton makeStringMatchAutomata(BytesRef term) {
String termString = term.utf8ToString();
return new CharacterRunAutomaton(Automata.makeString(termString)) {
@Override
public String toString() {
return termString;
}
};
}
}

View File

@ -1,148 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* Wraps an {@link Analyzer} and string text that represents multiple values delimited by a specified character. This
* exposes a TokenStream that matches what would get indexed considering the
* {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link Analyzer#getOffsetGap(String)} is
* 1; an exception will be thrown if it isn't.
* <br />
* It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer but doing so seems like
* more work. The underlying components see a Reader not a String -- and the String is easy to
* split up without redundant buffering.
*
* @lucene.internal
*/
final class MultiValueTokenStream extends TokenFilter {
private final String fieldName;
private final Analyzer indexAnalyzer;
private final String content;
private final char splitChar;
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private int startValIdx = 0;
private int endValIdx;
private int remainingPosInc = 0;
/** note: The caller must remember to close the TokenStream eventually. */
static TokenStream wrap(String fieldName, Analyzer indexAnalyzer, String content, char splitChar)
throws IOException {
if (indexAnalyzer.getOffsetGap(fieldName) != 1) { // note: 1 is the default. It is RARELY changed.
throw new IllegalArgumentException(
"offset gap of the provided analyzer should be 1 (field " + fieldName + ")");
}
// If there is no splitChar in content then we needn't wrap:
int splitCharIdx = content.indexOf(splitChar);
if (splitCharIdx == -1) {
return indexAnalyzer.tokenStream(fieldName, content);
}
TokenStream subTokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(0, splitCharIdx));
return new MultiValueTokenStream(subTokenStream, fieldName, indexAnalyzer, content, splitChar, splitCharIdx);
}
private MultiValueTokenStream(TokenStream subTokenStream, String fieldName, Analyzer indexAnalyzer,
String content, char splitChar, int splitCharIdx) {
super(subTokenStream); // subTokenStream is already initialized to operate on the first value
this.fieldName = fieldName;
this.indexAnalyzer = indexAnalyzer;
this.content = content;
this.splitChar = splitChar;
this.endValIdx = splitCharIdx;
}
@Override
public void reset() throws IOException {
if (startValIdx != 0) {
throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
// ... although we could if a need for it arises.
}
super.reset();
}
@Override
public boolean incrementToken() throws IOException {
while (true) {
if (input.incrementToken()) {
// Position tracking:
if (remainingPosInc > 0) {//usually true first token of additional values (not first val)
posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
remainingPosInc = 0;//reset
}
// Offset tracking:
offsetAtt.setOffset(
startValIdx + offsetAtt.startOffset(),
startValIdx + offsetAtt.endOffset()
);
return true;
}
if (endValIdx == content.length()) {//no more
return false;
}
input.end(); // might adjust position increment
remainingPosInc += posIncAtt.getPositionIncrement();
input.close();
remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);
// Get new tokenStream based on next segment divided by the splitChar
startValIdx = endValIdx + 1;
endValIdx = content.indexOf(splitChar, startValIdx);
if (endValIdx == -1) {//EOF
endValIdx = content.length();
}
TokenStream tokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
if (tokenStream != input) {// (input is defined in TokenFilter set in the constructor)
// This is a grand trick we do -- knowing that the analyzer's re-use strategy is going to produce the
// very same tokenStream instance and thus have the same AttributeSource as this wrapping TokenStream
// since we used it as our input in the constructor.
// Were this not the case, we'd have to copy every attribute of interest since we can't alter the
// AttributeSource of this wrapping TokenStream post-construction (it's all private/final).
// If this is a problem, we could do that instead; maybe with a custom CharTermAttribute that allows
// us to easily set the char[] reference without literally copying char by char.
throw new IllegalStateException("Require TokenStream re-use. Unsupported re-use strategy?: " +
indexAnalyzer.getReuseStrategy());
}
tokenStream.reset();
} // while loop to increment token of this new value
}
@Override
public void end() throws IOException {
super.end();
// Offset tracking:
offsetAtt.setOffset(
startValIdx + offsetAtt.startOffset(),
startValIdx + offsetAtt.endOffset());
}
}

View File

@ -76,6 +76,7 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
}
void nextPosition() throws IOException {
assert hasMorePositions();
pos++;
postingsEnum.nextPosition();
}

View File

@ -40,7 +40,7 @@ public final class Passage {
BytesRef matchTerms[] = new BytesRef[8];
int numMatches = 0;
void addMatch(int startOffset, int endOffset, BytesRef term) {
public void addMatch(int startOffset, int endOffset, BytesRef term) {
assert startOffset >= this.startOffset && startOffset <= this.endOffset;
if (numMatches == matchStarts.length) {
int newLength = ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);

View File

@ -266,7 +266,7 @@ public class PhraseHelper {
}
/**
* Returns terms as a List, but expanded to any terms in strictPhrases' keySet if present. That can only
* Returns terms as a List, but expanded to any terms in phraseHelper's keySet if present. That can only
* happen if willRewrite() is true.
*/
List<BytesRef> expandTermsIfRewrite(BytesRef[] terms, Map<BytesRef, Spans> strictPhrasesTermToSpans) {

View File

@ -41,7 +41,7 @@ public class PostingsOffsetStrategy extends FieldOffsetStrategy {
@Override
public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
LeafReader leafReader;
final LeafReader leafReader;
if (reader instanceof LeafReader) {
leafReader = (LeafReader) reader;
} else {
@ -54,6 +54,7 @@ public class PostingsOffsetStrategy extends FieldOffsetStrategy {
return createOffsetsEnumsFromReader(leafReader, docId);
}
@Override
public UnifiedHighlighter.OffsetSource getOffsetSource() {
return UnifiedHighlighter.OffsetSource.POSTINGS;

View File

@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@ -58,14 +57,11 @@ public class PostingsWithTermVectorsOffsetStrategy extends FieldOffsetStrategy {
}
leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms);
TokenStream tokenStream = automata.length > 0 ? MultiTermHighlighting
.uninvertAndFilterTerms(leafReader.terms(field), docId, this.automata, content.length()) : null;
return createOffsetsEnums(leafReader, docId, tokenStream);
return createOffsetsEnumsFromReader(leafReader, docId);
}
@Override
public UnifiedHighlighter.OffsetSource getOffsetSource() {
return UnifiedHighlighter.OffsetSource.POSTINGS;
return UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS;
}
}

View File

@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
@ -51,18 +50,10 @@ public class TermVectorOffsetStrategy extends FieldOffsetStrategy {
return Collections.emptyList();
}
LeafReader leafReader = null;
if ((terms.length > 0) || strictPhrases.willRewrite()) {
leafReader = new TermVectorLeafReader(field, tvTerms);
docId = 0;
}
LeafReader leafReader = new TermVectorLeafReader(field, tvTerms);
docId = 0;
TokenStream tokenStream = null;
if (automata.length > 0) {
tokenStream = MultiTermHighlighting.uninvertAndFilterTerms(tvTerms, 0, automata, content.length());
}
return createOffsetsEnums(leafReader, docId, tokenStream);
return createOffsetsEnumsFromReader(leafReader, docId);
}
}

View File

@ -1,395 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;
/**
* TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
* want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
* because you know the term vector has payloads, since the first call to incrementToken() will observe if you asked
* for them and if not then won't get them. This TokenStream supports an efficient {@link #reset()}, so there's
* no need to wrap with a caching impl.
*
* @lucene.internal
*/
final class TokenStreamFromTermVector extends TokenStream {
// note: differs from similar class in the standard highlighter. This one is optimized for sparse cases.
/**
* content length divided by distinct positions; an average of dense text.
*/
private static final double AVG_CHARS_PER_POSITION = 6;
private static final int INSERTION_SORT_THRESHOLD = 16;
private final Terms vector;
private final int filteredDocId;
private final CharTermAttribute termAttribute;
private final PositionIncrementAttribute positionIncrementAttribute;
private final int offsetLength;
private final float loadFactor;
private OffsetAttribute offsetAttribute;//maybe null
private PayloadAttribute payloadAttribute;//maybe null
private CharsRefBuilder termCharsBuilder;//term data here
private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
private TokenLL firstToken = null; // the head of a linked-list
private TokenLL incrementToken = null;
private boolean initialized = false;//lazy
public TokenStreamFromTermVector(Terms vector, int offsetLength) throws IOException {
this(vector, 0, offsetLength, 1f);
}
/**
* Constructor.
*
* @param vector Terms that contains the data for
* creating the TokenStream. Must have positions and/or offsets.
* @param filteredDocId The docID we will process.
* @param offsetLength Supply the character length of the text being uninverted, or a lower value if you don't want
* to invert text beyond an offset (in so doing this will act as a filter). If you don't
* know the length, pass -1. In conjunction with {@code loadFactor}, it's used to
* determine how many buckets to create during uninversion.
* It's also used to filter out tokens with a start offset exceeding this value.
* @param loadFactor The percent of tokens from the original terms (by position count) that are
* expected to be inverted. If they are filtered (e.g.
* {@link org.apache.lucene.index.FilterLeafReader.FilterTerms})
* then consider using less than 1.0 to avoid wasting space.
* 1.0 means all, 1/64th would suggest 1/64th of all tokens coming from vector.
*/
TokenStreamFromTermVector(Terms vector, int filteredDocId, int offsetLength, float loadFactor) throws IOException {
super();
this.filteredDocId = filteredDocId;
this.offsetLength = offsetLength == Integer.MAX_VALUE ? -1 : offsetLength;
if (loadFactor <= 0f || loadFactor > 1f) {
throw new IllegalArgumentException("loadFactor should be > 0 and <= 1");
}
this.loadFactor = loadFactor;
assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
}
assert vector.hasFreqs();
this.vector = vector;
termAttribute = addAttribute(CharTermAttribute.class);
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
}
public Terms getTermVectorTerms() {
return vector;
}
@Override
public void reset() throws IOException {
incrementToken = null;
super.reset();
}
//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
assert !initialized;
int dpEnumFlags = 0;
if (vector.hasOffsets()) {
offsetAttribute = addAttribute(OffsetAttribute.class);
dpEnumFlags |= PostingsEnum.OFFSETS;
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
payloadAttribute = getAttribute(PayloadAttribute.class);
payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
spareBytesRefBuilder = new BytesRefBuilder();
dpEnumFlags |= PostingsEnum.PAYLOADS;
}
// We put term data here
termCharsBuilder = new CharsRefBuilder();
termCharsBuilder.grow(initTotalTermCharLen());
// Step 1: iterate termsEnum and create a token, placing into a bucketed array (given a load factor)
final TokenLL[] tokenBuckets = initTokenBucketsArray();
final double OFFSET_TO_BUCKET_IDX = loadFactor / AVG_CHARS_PER_POSITION;
final double POSITION_TO_BUCKET_IDX = loadFactor;
final TermsEnum termsEnum = vector.iterator();
BytesRef termBytesRef;
PostingsEnum dpEnum = null;
final CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
TERM_LOOP:
while ((termBytesRef = termsEnum.next()) != null) {
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
// note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
tempCharsRefBuilder.grow(termBytesRef.length);
final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
final int termCharsOff = termCharsBuilder.length();
termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
int currentDocId = dpEnum.advance(filteredDocId);
if (currentDocId != filteredDocId) {
continue; //Not expected
}
final int freq = dpEnum.freq();
for (int j = 0; j < freq; j++) {
TokenLL token = new TokenLL();
token.position = dpEnum.nextPosition(); // can be -1 if not in the TV
token.termCharsOff = termCharsOff;
token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
// copy offset (if it's there) and compute bucketIdx
int bucketIdx;
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
if (offsetLength >= 0 && token.startOffset > offsetLength) {
continue TERM_LOOP;//filter this token out; exceeds threshold
}
token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
bucketIdx = (int) (token.startOffset * OFFSET_TO_BUCKET_IDX);
} else {
bucketIdx = (int) (token.position * POSITION_TO_BUCKET_IDX);
}
if (bucketIdx >= tokenBuckets.length) {
bucketIdx = tokenBuckets.length - 1;
}
if (payloadAttribute != null) {
final BytesRef payload = dpEnum.getPayload();
token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
}
//Add token to the head of the bucket linked list
token.next = tokenBuckets[bucketIdx];
tokenBuckets[bucketIdx] = token;
}
}
// Step 2: Link all Tokens into a linked-list and sort all tokens at the same position
firstToken = initLinkAndSortTokens(tokenBuckets);
// If the term vector didn't have positions, synthesize them
if (!vector.hasPositions() && firstToken != null) {
TokenLL prevToken = firstToken;
prevToken.position = 0;
for (TokenLL token = prevToken.next; token != null; prevToken = token, token = token.next) {
if (prevToken.startOffset == token.startOffset) {
token.position = prevToken.position;
} else {
token.position = prevToken.position + 1;
}
}
}
initialized = true;
}
private static TokenLL initLinkAndSortTokens(TokenLL[] tokenBuckets) {
TokenLL firstToken = null;
List<TokenLL> scratchTokenArray = new ArrayList<>(); // declare here for re-use. TODO use native array
TokenLL prevToken = null;
for (TokenLL tokenHead : tokenBuckets) {
if (tokenHead == null) {
continue;
}
//sort tokens at this position and link them; return the first
TokenLL tokenTail;
// just one token
if (tokenHead.next == null) {
tokenTail = tokenHead;
} else {
// add the linked list to a temporary array
for (TokenLL cur = tokenHead; cur != null; cur = cur.next) {
scratchTokenArray.add(cur);
}
// sort; and set tokenHead & tokenTail
if (scratchTokenArray.size() < INSERTION_SORT_THRESHOLD) {
// insertion sort by creating a linked list (leave scratchTokenArray alone)
tokenHead = tokenTail = scratchTokenArray.get(0);
tokenHead.next = null;
for (int i = 1; i < scratchTokenArray.size(); i++) {
TokenLL insertToken = scratchTokenArray.get(i);
if (insertToken.compareTo(tokenHead) <= 0) {
// takes the place of tokenHead
insertToken.next = tokenHead;
tokenHead = insertToken;
} else {
// goes somewhere after tokenHead
for (TokenLL prev = tokenHead; true; prev = prev.next) {
if (prev.next == null || insertToken.compareTo(prev.next) <= 0) {
if (prev.next == null) {
tokenTail = insertToken;
}
insertToken.next = prev.next;
prev.next = insertToken;
break;
}
}
}
}
} else {
Collections.sort(scratchTokenArray);
// take back out and create a linked list
TokenLL prev = tokenHead = scratchTokenArray.get(0);
for (int i = 1; i < scratchTokenArray.size(); i++) {
prev.next = scratchTokenArray.get(i);
prev = prev.next;
}
tokenTail = prev;
tokenTail.next = null;
}
scratchTokenArray.clear(); // note: ArrayList.clear() also nulls out its backing array entries, which we don't actually need
}
//link to previous
if (prevToken != null) {
assert prevToken.next == null;
prevToken.next = tokenHead; //concatenate linked-list
assert prevToken.compareTo(tokenHead) < 0 : "wrong offset / position ordering expectations";
} else {
assert firstToken == null;
firstToken = tokenHead;
}
prevToken = tokenTail;
}
return firstToken;
}
private int initTotalTermCharLen() throws IOException {
int guessNumTerms;
if (vector.size() != -1) {
guessNumTerms = (int) vector.size();
} else if (offsetLength != -1) {
guessNumTerms = (int) (offsetLength * 0.33);//guess 1/3rd
} else {
return 128;
}
return Math.max(64, (int) (guessNumTerms * loadFactor * 7.0)); // 7 is an over-estimate of the average term length
}
private TokenLL[] initTokenBucketsArray() throws IOException {
// Estimate the number of non-empty positions (number of tokens, excluding same-position synonyms).
int positionsEstimate;
if (offsetLength == -1) { // no clue what the char length is.
// Estimate the number of position slots we need from term stats based on Wikipedia.
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
if (sumTotalTermFreq == -1) { // unfortunately term vectors don't seem to have this stat
int size = (int) vector.size();
if (size == -1) { // doesn't happen with term vectors, it seems, but pick a default anyway
size = 128;
}
sumTotalTermFreq = (int) (size * 2.4);
}
positionsEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
} else {
// otherwise, estimate the number of token positions from the character length using this factor.
positionsEstimate = (int) (offsetLength / AVG_CHARS_PER_POSITION);
}
// apply the load factor.
return new TokenLL[Math.max(1, (int) (positionsEstimate * loadFactor))];
}
@Override
public boolean incrementToken() throws IOException {
int posInc;
if (incrementToken == null) {
if (!initialized) {
init();
assert initialized;
}
incrementToken = firstToken;
if (incrementToken == null) {
return false;
}
posInc = incrementToken.position + 1;//first token normally has pos 0; add 1 to get posInc
} else if (incrementToken.next != null) {
int lastPosition = incrementToken.position;
incrementToken = incrementToken.next;
posInc = incrementToken.position - lastPosition;
} else {
return false;
}
clearAttributes();
termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
positionIncrementAttribute.setPositionIncrement(posInc);
if (offsetAttribute != null) {
offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
}
if (payloadAttribute != null && incrementToken.payloadIndex >= 0) {
payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
}
return true;
}
private static class TokenLL implements Comparable<TokenLL> {
// This class should weigh 32 bytes, including object header
int termCharsOff; // see termCharsBuilder
short termCharsLen;
int position;
int startOffset;
short endOffsetInc; // add to startOffset to get endOffset
int payloadIndex;
TokenLL next;
@Override
public int compareTo(TokenLL tokenB) {
int cmp = Integer.compare(this.position, tokenB.position);
if (cmp == 0) {
cmp = Integer.compare(this.startOffset, tokenB.startOffset);
if (cmp == 0) {
cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
}
}
return cmp;
}
}
}
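
To make the position-synthesis rule above concrete, here is a toy standalone sketch; it is not part of this patch, and the Tok class and method are invented purely for illustration. When a term vector supplies offsets but no positions, tokens that share a startOffset keep the same synthetic position, and the position advances by one whenever the startOffset changes.

import java.util.List;

final class Tok {
  final int startOffset;
  int position;
  Tok(int startOffset) { this.startOffset = startOffset; }
}

final class SynthesizePositionsSketch {
  // Mirrors the loop above: assumes the tokens are already linked/sorted by startOffset.
  static void synthesizePositions(List<Tok> tokensInOffsetOrder) {
    if (tokensInOffsetOrder.isEmpty()) {
      return;
    }
    Tok prev = tokensInOffsetOrder.get(0);
    prev.position = 0;
    for (int i = 1; i < tokensInOffsetOrder.size(); i++) {
      Tok cur = tokensInOffsetOrder.get(i);
      // same start offset (e.g. a same-position synonym) => same position; otherwise the next position
      cur.position = (cur.startOffset == prev.startOffset) ? prev.position : prev.position + 1;
      prev = cur;
    }
  }
}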

View File

@ -0,0 +1,173 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.Closeable;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* Analyzes the text, producing a single {@link OffsetsEnum} wrapping the {@link TokenStream} filtered to terms
* in the query, including wildcards. It can't handle position-sensitive queries (phrases). Passage accuracy suffers
* because the freq() is unknown -- it's always {@link Integer#MAX_VALUE} instead.
*/
public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0];
public TokenStreamOffsetStrategy(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer indexAnalyzer) {
super(field, ZERO_LEN_BYTES_REF_ARRAY, phraseHelper, convertTermsToAutomata(terms, automata), indexAnalyzer);
assert phraseHelper.hasPositionSensitivity() == false;
}
private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
for (int i = 0; i < terms.length; i++) {
String termString = terms[i].utf8ToString();
newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) {
@Override
public String toString() {
return termString;
}
};
}
// Append the existing automata (those used for MTQs)
System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
return newAutomata;
}
@Override
public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
TokenStream tokenStream = tokenStream(content);
PostingsEnum mtqPostingsEnum = new TokenStreamPostingsEnum(tokenStream, automata);
mtqPostingsEnum.advance(docId);
return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
}
// We could consume the stream twice (or cache it) to compute a true freq(),
// but this would have a performance cost for likely little gain in the user experience; it
// would only serve to make freq() less bogus.
// Instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
// TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
TokenStream stream; // becomes null when closed
final CharacterRunAutomaton[] matchers;
final CharTermAttribute charTermAtt;
final OffsetAttribute offsetAtt;
int currentDoc = -1;
int currentMatch = -1;
int currentStartOffset = -1;
int currentEndOffset = -1;
final BytesRef matchDescriptions[];
TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
this.stream = ts;
this.matchers = matchers;
matchDescriptions = new BytesRef[matchers.length];
charTermAtt = ts.addAttribute(CharTermAttribute.class);
offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
}
@Override
public int nextPosition() throws IOException {
if (stream != null) {
while (stream.incrementToken()) {
for (int i = 0; i < matchers.length; i++) {
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
currentStartOffset = offsetAtt.startOffset();
currentEndOffset = offsetAtt.endOffset();
currentMatch = i;
return 0;
}
}
}
stream.end();
close();
}
// exhausted
currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
return Integer.MAX_VALUE;
}
@Override
public int freq() throws IOException {
return Integer.MAX_VALUE; // lie
}
@Override
public int startOffset() throws IOException {
assert currentStartOffset >= 0;
return currentStartOffset;
}
@Override
public int endOffset() throws IOException {
assert currentEndOffset >= 0;
return currentEndOffset;
}
@Override
public BytesRef getPayload() throws IOException {
if (matchDescriptions[currentMatch] == null) {
matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
}
return matchDescriptions[currentMatch];
}
@Override
public int docID() {
return currentDoc;
}
@Override
public int nextDoc() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int advance(int target) throws IOException {
return currentDoc = target;
}
@Override
public long cost() {
return 0;
}
@Override
public void close() throws IOException {
if (stream != null) {
stream.close();
stream = null;
}
}
}
}
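
The heart of TokenStreamPostingsEnum is running each token's character buffer through the query-derived CharacterRunAutomaton matchers. Below is a minimal standalone sketch of just that matching step; the term "bravo" is an arbitrary example, while the Lucene classes and calls are the same ones used in the code above.

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

public class AutomatonMatchSketch {
  public static void main(String[] args) {
    // Build a matcher for the literal term "bravo", as convertTermsToAutomata does for each query term.
    CharacterRunAutomaton matcher = new CharacterRunAutomaton(Automata.makeString("bravo"));

    char[] token = "bravo".toCharArray();
    // TokenStreamPostingsEnum calls matcher.run(charTermAtt.buffer(), 0, charTermAtt.length()) per token.
    System.out.println(matcher.run(token, 0, token.length));   // true

    char[] other = "bravado".toCharArray();
    System.out.println(matcher.run(other, 0, other.length));   // false: only the exact term matches
  }
}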

View File

@ -117,6 +117,8 @@ public class UnifiedHighlighter {
private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging"
private boolean defaultPassageRelevancyOverSpeed = true; //For analysis, prefer MemoryIndexOffsetStrategy
// private boolean defaultRequireFieldMatch = true; TODO
private int maxLength = DEFAULT_MAX_LENGTH;
@ -213,6 +215,12 @@ public class UnifiedHighlighter {
return defaultHighlightPhrasesStrictly;
}
protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
return defaultPassageRelevancyOverSpeed;
}
/**
* The maximum content size to process. Content will be truncated to this size before highlighting. Typically
* snippets closer to the beginning of the document better summarize its content.
@ -716,8 +724,13 @@ public class UnifiedHighlighter {
}
protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
BytesRef[] terms = filterExtractedTerms(field, allTerms);
Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
return new FieldHighlighter(field,
getOffsetStrategy(field, query, allTerms),
getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags),
new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
getScorer(field),
maxPassages,
@ -725,41 +738,7 @@ public class UnifiedHighlighter {
getFormatter(field));
}
protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, SortedSet<Term> allTerms) {
EnumSet<HighlightFlag> highlightFlags = getFlags(field);
BytesRef[] terms = filterExtractedTerms(field, allTerms);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
switch (offsetSource) {
case ANALYSIS:
return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
this::preMultiTermQueryRewrite);
case NONE_NEEDED:
return NoOpOffsetStrategy.INSTANCE;
case TERM_VECTORS:
return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata);
case POSTINGS:
return new PostingsOffsetStrategy(field, terms, phraseHelper, automata);
case POSTINGS_WITH_TERM_VECTORS:
return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata);
default:
throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
}
}
protected EnumSet<HighlightFlag> getFlags(String field) {
EnumSet<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
if (shouldHandleMultiTermQuery(field)) {
highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
}
if (shouldHighlightPhrasesStrictly(field)) {
highlightFlags.add(HighlightFlag.PHRASES);
}
return highlightFlags;
}
protected BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
protected static BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
// TODO consider requireFieldMatch
Term floor = new Term(field, "");
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
@ -774,7 +753,21 @@ public class UnifiedHighlighter {
return terms;
}
protected PhraseHelper getPhraseHelper(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
protected Set<HighlightFlag> getFlags(String field) {
Set<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
if (shouldHandleMultiTermQuery(field)) {
highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
}
if (shouldHighlightPhrasesStrictly(field)) {
highlightFlags.add(HighlightFlag.PHRASES);
}
if (shouldPreferPassageRelevancyOverSpeed(field)) {
highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
}
return highlightFlags;
}
protected PhraseHelper getPhraseHelper(String field, Query query, Set<HighlightFlag> highlightFlags) {
boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES);
boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
return highlightPhrasesStrictly ?
@ -782,7 +775,7 @@ public class UnifiedHighlighter {
PhraseHelper.NONE;
}
protected CharacterRunAutomaton[] getAutomata(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES),
this::preMultiTermQueryRewrite)
@ -790,11 +783,12 @@ public class UnifiedHighlighter {
}
protected OffsetSource getOptimizedOffsetSource(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
OffsetSource offsetSource = getOffsetSource(field);
if (terms.length == 0 && automata.length == 0 && !phraseHelper.willRewrite()) {
return OffsetSource.NONE_NEEDED; //nothing to highlight
}
OffsetSource offsetSource = getOffsetSource(field);
switch (offsetSource) {
case POSTINGS:
if (phraseHelper.willRewrite()) {
@ -822,6 +816,32 @@ public class UnifiedHighlighter {
return offsetSource;
}
protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms,
PhraseHelper phraseHelper, CharacterRunAutomaton[] automata,
Set<HighlightFlag> highlightFlags) {
switch (offsetSource) {
case ANALYSIS:
if (!phraseHelper.hasPositionSensitivity() &&
!highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)) {
//skip using a memory index since it's pure term filtering
return new TokenStreamOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer());
} else {
return new MemoryIndexOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
this::preMultiTermQueryRewrite);
}
case NONE_NEEDED:
return NoOpOffsetStrategy.INSTANCE;
case TERM_VECTORS:
return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata);
case POSTINGS:
return new PostingsOffsetStrategy(field, terms, phraseHelper, automata);
case POSTINGS_WITH_TERM_VECTORS:
return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata);
default:
throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
}
}
/**
* When highlighting phrases accurately, we need to know which {@link SpanQuery}'s need to have
* {@link Query#rewrite(IndexReader)} called on them. It helps performance to avoid it if it's not needed.
@ -1041,10 +1061,9 @@ public class UnifiedHighlighter {
*/
public enum HighlightFlag {
PHRASES,
MULTI_TERM_QUERY
MULTI_TERM_QUERY,
PASSAGE_RELEVANCY_OVER_SPEED
// TODO: ignoreQueryFields
// TODO: useQueryBoosts
// TODO: avoidMemoryIndexIfPossible
// TODO: preferMemoryIndexForStats
}
}
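
A rough usage sketch of the new hook follows; the index setup and field contents are illustrative only and not from this patch. Overriding shouldPreferPassageRelevancyOverSpeed() to return false means that, for a field highlighted via re-analysis (the ANALYSIS offset source) with only terms and wildcards in the query, the highlighter takes the lighter TokenStreamOffsetStrategy path instead of building a MemoryIndex.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.RAMDirectory;

public class PreferSpeedHighlightSketch {
  public static void main(String[] args) throws Exception {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    RAMDirectory dir = new RAMDirectory();
    try (IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
      Document doc = new Document();
      // TextField: indexed with positions but no offsets, and stored, so the highlighter re-analyzes.
      doc.add(new TextField("body", "Alpha Bravo Bravado foo foo foo", Field.Store.YES));
      iw.addDocument(doc);
    }

    try (IndexReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);
      // Opt out of passage relevancy to favor speed; with no phrases in the query, the
      // ANALYSIS offset source should then use TokenStreamOffsetStrategy.
      UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer) {
        @Override
        protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
          return false;
        }
      };
      Query query = new PrefixQuery(new Term("body", "bra"));
      TopDocs topDocs = searcher.search(query, 10);
      String[] snippets = highlighter.highlight("body", query, topDocs);
      System.out.println(snippets[0]); // e.g. "Alpha <b>Bravo</b> <b>Bravado</b> foo foo foo"
    }
  }
}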

View File

@ -773,7 +773,40 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
ir.close();
}
public void testTokenStreamIsClosed() throws IOException {
public void testWithMaxLenAndMultipleWildcardMatches() throws IOException {
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
Field body = new Field("body", "", fieldType);
Document doc = new Document();
doc.add(body);
//tests interleaving of multiple wildcard matches with the CompositePostingsEnum
//In this case the CompositePostingsEnum will have an underlying PostingsEnum that jumps from pos 1 to 9 for bravo
//and a second with position 2 for Bravado
body.setStringValue("Alpha Bravo Bravado foo foo foo. Foo foo Alpha Bravo");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setMaxLength(32);//a little past first sentence
BooleanQuery query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("body", "alpha")), BooleanClause.Occur.MUST)
.add(new PrefixQuery(new Term("body", "bra")), BooleanClause.Occur.MUST)
.build();
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
String snippets[] = highlighter.highlight("body", query, topDocs, 2);//ask for 2 but we'll only get 1
assertArrayEquals(
new String[]{"<b>Alpha</b> <b>Bravo</b> <b>Bravado</b> foo foo foo."}, snippets
);
ir.close();
}
public void testTokenStreamIsClosed() throws Exception {
// note: test is a derivative of testWithMaxLen()
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
@ -828,8 +861,8 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
if (fieldType == UHTestHelper.reanalysisType) {
fail("Expecting EXPECTED IOException");
}
} catch (IOException e) {
if (!e.getMessage().equals("EXPECTED")) {
} catch (Exception e) {
if (!e.getMessage().contains("EXPECTED")) {
throw e;
}
}

View File

@ -50,9 +50,8 @@ public class TestUnifiedHighlighterRanking extends LuceneTestCase {
Analyzer indexAnalyzer;
// note: don't choose reanalysis because it doesn't always know the term frequency, which is a statistic used
// in passage ranking. Sometimes it does (e.g. when it builds a MemoryIndex) but not necessarily.
final FieldType fieldType = UHTestHelper.randomFieldType(random(), UHTestHelper.postingsType, UHTestHelper.tvType);
// note: all offset sources, by default, use term freq, so it shouldn't matter which we choose.
final FieldType fieldType = UHTestHelper.randomFieldType(random());
/**
* indexes a bunch of gibberish, and then highlights top(n).

View File

@ -22,11 +22,13 @@ import java.text.BreakIterator;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
@ -68,6 +70,11 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
return Collections.emptyList();
}
@Override
protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
return super.createOffsetsEnumsFromReader(leafReader, doc);
}
};
assertEquals(offsetSource, strategy.getOffsetSource());
}
@ -142,8 +149,8 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
}
@Override
protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, SortedSet<Term> allTerms) {
return super.getOffsetStrategy(field, query, allTerms);
protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Set<HighlightFlag> highlightFlags) {
return super.getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
}
@Override