LUCENE-7815: Removed the PostingsHighlighter

David Smiley 2017-05-23 14:39:51 -04:00
parent 14320a584c
commit 0d3c73eaa2
21 changed files with 522 additions and 3993 deletions

View File

@@ -53,6 +53,10 @@ API Changes
* LUCENE-7741: DoubleValuesSource now has an explain() method (Alan Woodward,
Adrien Grand)
* LUCENE-7815: Removed the PostingsHighlighter; you should use the UnifiedHighlighter
instead, which was derived from the PostingsHighlighter. WholeBreakIterator and
CustomSeparatorBreakIterator were moved to the UnifiedHighlighter's package. (David Smiley)
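For readers migrating code, a minimal sketch of the replacement call (not part of this commit's diff; the "body" field name, searcher, and analyzer are assumptions, and the field must have been indexed with offsets just as the PostingsHighlighter required):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

static String[] firstSnippets(IndexSearcher searcher, Analyzer indexAnalyzer) throws IOException {
  // UnifiedHighlighter takes the searcher plus the index-time analyzer (used for multi-term queries)
  UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
  Query query = new TermQuery(new Term("body", "highlighting"));
  TopDocs topDocs = searcher.search(query, 10);
  // one snippet per hit; an extra int argument selects more passages per document
  return highlighter.highlight("body", query, topDocs);
}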
Bug Fixes
* LUCENE-7626: IndexWriter will no longer accept broken token offsets

View File

@@ -38,7 +38,7 @@ file.query.maker.file=conf/query-terms.txt
log.queries=false
log.step.SearchTravRetHighlight=-1
highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
highlighter=HlImpl:NONE:SH_A:UH_A:UH_P:UH_PV
{ "Populate"
CreateIndex
@@ -60,6 +60,6 @@ highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
CloseReader
NewRound
} : 6
} : 5
RepSumByPrefRound HL

View File

@@ -42,7 +42,6 @@ import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
@@ -133,8 +132,6 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
case "UH_P": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); break;
case "UH_PV": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); break;
case "PH_P": hlImpl = new PostingsHLImpl(); break;
default: throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')");
}
}
@@ -224,33 +221,6 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
return clone;
}
private class PostingsHLImpl implements HLImpl {
PostingsHighlighter highlighter;
String[] fields = hlFields.toArray(new String[hlFields.size()]);
int[] maxPassages;
PostingsHLImpl() {
highlighter = new PostingsHighlighter(maxDocCharsToAnalyze) {
@Override
protected Analyzer getIndexAnalyzer(String field) { // thus support wildcards
return analyzer;
}
@Override
protected BreakIterator getBreakIterator(String field) {
return BreakIterator.getSentenceInstance(Locale.ENGLISH);
}
};
maxPassages = new int[hlFields.size()];
Arrays.fill(maxPassages, maxFrags);
}
@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
Map<String, String[]> result = highlighter.highlightFields(fields, q, searcher, hits, maxPassages);
preventOptimizeAway = result.size();
}
}
private class UnifiedHLImpl implements HLImpl {
UnifiedHighlighter highlighter;
IndexSearcher lastSearcher;

View File

@@ -1,137 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.postingshighlight;
/**
* Creates a formatted snippet from the top passages.
* <p>
* The default implementation marks the query terms as bold, and places
* ellipses between unconnected passages.
*/
public class DefaultPassageFormatter extends PassageFormatter {
/** text that will appear before highlighted terms */
protected final String preTag;
/** text that will appear after highlighted terms */
protected final String postTag;
/** text that will appear between two unconnected passages */
protected final String ellipsis;
/** true if we should escape for html */
protected final boolean escape;
/**
* Creates a new DefaultPassageFormatter with the default tags.
*/
public DefaultPassageFormatter() {
this("<b>", "</b>", "... ", false);
}
/**
* Creates a new DefaultPassageFormatter with custom tags.
* @param preTag text which should appear before a highlighted term.
* @param postTag text which should appear after a highlighted term.
* @param ellipsis text which should be used to connect two unconnected passages.
* @param escape true if text should be html-escaped
*/
public DefaultPassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
if (preTag == null || postTag == null || ellipsis == null) {
throw new NullPointerException();
}
this.preTag = preTag;
this.postTag = postTag;
this.ellipsis = ellipsis;
this.escape = escape;
}
@Override
public String format(Passage passages[], String content) {
StringBuilder sb = new StringBuilder();
int pos = 0;
for (Passage passage : passages) {
// don't add ellipsis if it's the first one, or if it's connected.
if (passage.startOffset > pos && pos > 0) {
sb.append(ellipsis);
}
pos = passage.startOffset;
for (int i = 0; i < passage.numMatches; i++) {
int start = passage.matchStarts[i];
int end = passage.matchEnds[i];
// it's possible to have overlapping terms
if (start > pos) {
append(sb, content, pos, start);
}
if (end > pos) {
sb.append(preTag);
append(sb, content, Math.max(pos, start), end);
sb.append(postTag);
pos = end;
}
}
// it's possible a "term" from the analyzer could span a sentence boundary.
append(sb, content, pos, Math.max(pos, passage.endOffset));
pos = passage.endOffset;
}
return sb.toString();
}
/**
* Appends original text to the response.
* @param dest resulting text, possibly transformed or encoded
* @param content original text content
* @param start index of the first character in content
* @param end index of the character following the last character in content
*/
protected void append(StringBuilder dest, String content, int start, int end) {
if (escape) {
// note: these are the rules from owasp.org
for (int i = start; i < end; i++) {
char ch = content.charAt(i);
switch(ch) {
case '&':
dest.append("&amp;");
break;
case '<':
dest.append("&lt;");
break;
case '>':
dest.append("&gt;");
break;
case '"':
dest.append("&quot;");
break;
case '\'':
dest.append("&#x27;");
break;
case '/':
dest.append("&#x2F;");
break;
default:
if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) {
dest.append(ch);
} else if (ch < 0xff) {
dest.append("&#");
dest.append((int)ch);
dest.append(";");
} else {
dest.append(ch);
}
}
}
} else {
dest.append(content, start, end);
}
}
}
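As a hedged aside (not part of this commit), the four-argument constructor documented above is kept with the same signature by the uhighlight package's DefaultPassageFormatter, so a customized formatter ports over unchanged, e.g.:

// <em> tags instead of <b>, "... " between unconnected passages, HTML-escape the content
DefaultPassageFormatter formatter = new DefaultPassageFormatter("<em>", "</em>", "... ", true);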

View File

@@ -1,282 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.postingshighlight;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanPositionCheckQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
/**
* Support for highlighting multiterm queries in PostingsHighlighter.
*/
class MultiTermHighlighting {
/**
* Extracts all MultiTermQueries for {@code field}, and returns equivalent
* automata that will match terms.
*/
static CharacterRunAutomaton[] extractAutomata(Query query, String field) {
List<CharacterRunAutomaton> list = new ArrayList<>();
if (query instanceof BooleanQuery) {
for (BooleanClause clause : (BooleanQuery) query) {
if (!clause.isProhibited()) {
list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field)));
}
}
} else if (query instanceof ConstantScoreQuery) {
list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field)));
} else if (query instanceof DisjunctionMaxQuery) {
for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
list.addAll(Arrays.asList(extractAutomata(sub, field)));
}
} else if (query instanceof SpanOrQuery) {
for (Query sub : ((SpanOrQuery) query).getClauses()) {
list.addAll(Arrays.asList(extractAutomata(sub, field)));
}
} else if (query instanceof SpanNearQuery) {
for (Query sub : ((SpanNearQuery) query).getClauses()) {
list.addAll(Arrays.asList(extractAutomata(sub, field)));
}
} else if (query instanceof SpanNotQuery) {
list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field)));
} else if (query instanceof SpanPositionCheckQuery) {
list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field)));
} else if (query instanceof SpanMultiTermQueryWrapper) {
list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field)));
} else if (query instanceof PrefixQuery) {
final PrefixQuery pq = (PrefixQuery) query;
Term prefix = pq.getPrefix();
if (prefix.field().equals(field)) {
list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()),
Automata.makeAnyString())) {
@Override
public String toString() {
return pq.toString();
}
});
}
} else if (query instanceof FuzzyQuery) {
final FuzzyQuery fq = (FuzzyQuery) query;
if (fq.getField().equals(field)) {
String utf16 = fq.getTerm().text();
int termText[] = new int[utf16.codePointCount(0, utf16.length())];
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
termText[j++] = cp = utf16.codePointAt(i);
}
int termLength = termText.length;
int prefixLength = Math.min(fq.getPrefixLength(), termLength);
String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
String prefix = UnicodeUtil.newString(termText, 0, prefixLength);
Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), prefix);
list.add(new CharacterRunAutomaton(automaton) {
@Override
public String toString() {
return fq.toString();
}
});
}
} else if (query instanceof TermRangeQuery) {
final TermRangeQuery tq = (TermRangeQuery) query;
if (tq.getField().equals(field)) {
final CharsRef lowerBound;
if (tq.getLowerTerm() == null) {
lowerBound = null;
} else {
lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
}
final CharsRef upperBound;
if (tq.getUpperTerm() == null) {
upperBound = null;
} else {
upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
}
final boolean includeLower = tq.includesLower();
final boolean includeUpper = tq.includesUpper();
final CharsRef scratch = new CharsRef();
final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
// this is *not* an automaton, but it's very simple
list.add(new CharacterRunAutomaton(Automata.makeEmpty()) {
@Override
public boolean run(char[] s, int offset, int length) {
scratch.chars = s;
scratch.offset = offset;
scratch.length = length;
if (lowerBound != null) {
int cmp = comparator.compare(scratch, lowerBound);
if (cmp < 0 || (!includeLower && cmp == 0)) {
return false;
}
}
if (upperBound != null) {
int cmp = comparator.compare(scratch, upperBound);
if (cmp > 0 || (!includeUpper && cmp == 0)) {
return false;
}
}
return true;
}
@Override
public String toString() {
return tq.toString();
}
});
}
} else if (query instanceof AutomatonQuery) {
final AutomatonQuery aq = (AutomatonQuery) query;
if (aq.getField().equals(field)) {
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
@Override
public String toString() {
return aq.toString();
}
});
}
}
return list.toArray(new CharacterRunAutomaton[list.size()]);
}
/**
* Returns a "fake" DocsAndPositionsEnum over the tokenstream, returning offsets where {@code matchers}
* matches tokens.
* <p>
* This is solely used internally by PostingsHighlighter: <b>DO NOT USE THIS METHOD!</b>
*/
static PostingsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException {
final CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
// TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq()
// but this would have a performance cost for likely little gain in the user experience, it
// would only serve to make this method less bogus.
// instead, we always return freq() = Integer.MAX_VALUE and let PH terminate based on offset...
return new PostingsEnum() {
int currentDoc = -1;
int currentMatch = -1;
int currentStartOffset = -1;
int currentEndOffset = -1;
TokenStream stream = ts;
final BytesRef matchDescriptions[] = new BytesRef[matchers.length];
@Override
public int nextPosition() throws IOException {
if (stream != null) {
while (stream.incrementToken()) {
for (int i = 0; i < matchers.length; i++) {
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
currentStartOffset = offsetAtt.startOffset();
currentEndOffset = offsetAtt.endOffset();
currentMatch = i;
return 0;
}
}
}
stream.end();
stream.close();
stream = null;
}
// exhausted
currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
return Integer.MAX_VALUE;
}
@Override
public int freq() throws IOException {
return Integer.MAX_VALUE; // lie
}
@Override
public int startOffset() throws IOException {
assert currentStartOffset >= 0;
return currentStartOffset;
}
@Override
public int endOffset() throws IOException {
assert currentEndOffset >= 0;
return currentEndOffset;
}
@Override
public BytesRef getPayload() throws IOException {
if (matchDescriptions[currentMatch] == null) {
matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
}
return matchDescriptions[currentMatch];
}
@Override
public int docID() {
return currentDoc;
}
@Override
public int nextDoc() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int advance(int target) throws IOException {
return currentDoc = target;
}
@Override
public long cost() {
return 0;
}
};
}
}

View File

@@ -1,159 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.postingshighlight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Represents a passage (typically a sentence of the document).
* <p>
* A passage contains {@link #getNumMatches} highlights from the query,
* and the offsets and query terms that correspond with each match.
* @lucene.experimental
*/
public final class Passage {
int startOffset = -1;
int endOffset = -1;
float score = 0.0f;
int matchStarts[] = new int[8];
int matchEnds[] = new int[8];
BytesRef matchTerms[] = new BytesRef[8];
int numMatches = 0;
void addMatch(int startOffset, int endOffset, BytesRef term) {
assert startOffset >= this.startOffset && startOffset <= this.endOffset;
if (numMatches == matchStarts.length) {
int newLength = ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
int newMatchStarts[] = new int[newLength];
int newMatchEnds[] = new int[newLength];
BytesRef newMatchTerms[] = new BytesRef[newLength];
System.arraycopy(matchStarts, 0, newMatchStarts, 0, numMatches);
System.arraycopy(matchEnds, 0, newMatchEnds, 0, numMatches);
System.arraycopy(matchTerms, 0, newMatchTerms, 0, numMatches);
matchStarts = newMatchStarts;
matchEnds = newMatchEnds;
matchTerms = newMatchTerms;
}
assert matchStarts.length == matchEnds.length && matchEnds.length == matchTerms.length;
matchStarts[numMatches] = startOffset;
matchEnds[numMatches] = endOffset;
matchTerms[numMatches] = term;
numMatches++;
}
void sort() {
final int starts[] = matchStarts;
final int ends[] = matchEnds;
final BytesRef terms[] = matchTerms;
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
int temp = starts[i];
starts[i] = starts[j];
starts[j] = temp;
temp = ends[i];
ends[i] = ends[j];
ends[j] = temp;
BytesRef tempTerm = terms[i];
terms[i] = terms[j];
terms[j] = tempTerm;
}
@Override
protected int compare(int i, int j) {
return Integer.compare(starts[i], starts[j]);
}
}.sort(0, numMatches);
}
void reset() {
startOffset = endOffset = -1;
score = 0.0f;
numMatches = 0;
}
/**
* Start offset of this passage.
* @return start index (inclusive) of the passage in the
* original content: always &gt;= 0.
*/
public int getStartOffset() {
return startOffset;
}
/**
* End offset of this passage.
* @return end index (exclusive) of the passage in the
* original content: always &gt;= {@link #getStartOffset()}
*/
public int getEndOffset() {
return endOffset;
}
/**
* Passage's score.
*/
public float getScore() {
return score;
}
/**
* Number of term matches available in
* {@link #getMatchStarts}, {@link #getMatchEnds},
* {@link #getMatchTerms}
*/
public int getNumMatches() {
return numMatches;
}
/**
* Start offsets of the term matches, in increasing order.
* <p>
* Only {@link #getNumMatches} are valid. Note that these
* offsets are absolute (not relative to {@link #getStartOffset()}).
*/
public int[] getMatchStarts() {
return matchStarts;
}
/**
* End offsets of the term matches, corresponding with {@link #getMatchStarts}.
* <p>
* Only {@link #getNumMatches} are valid. Note that it's possible that an end offset
* could exceed beyond the bounds of the passage ({@link #getEndOffset()}), if the
* Analyzer produced a term which spans a passage boundary.
*/
public int[] getMatchEnds() {
return matchEnds;
}
/**
* BytesRef (term text) of the matches, corresponding with {@link #getMatchStarts()}.
* <p>
* Only {@link #getNumMatches()} are valid.
*/
public BytesRef[] getMatchTerms() {
return matchTerms;
}
}
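A hedged sketch of how a formatter typically consumes these accessors, respecting the contract above that only the first getNumMatches() slots are valid and that offsets are absolute (passage and content are assumed to come from a PassageFormatter.format() call):

for (int i = 0; i < passage.getNumMatches(); i++) {
  int start = passage.getMatchStarts()[i];                          // absolute offset into content
  int end = passage.getMatchEnds()[i];                              // may extend past getEndOffset()
  String matched = content.substring(start, Math.min(end, content.length()));
  // ... wrap 'matched' in highlight markup ...
}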

View File

@@ -1,40 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.postingshighlight;
/**
* Creates a formatted snippet from the top passages.
*
* @lucene.experimental
*/
public abstract class PassageFormatter {
/**
* Formats the top <code>passages</code> from <code>content</code>
* into a human-readable text snippet.
*
* @param passages top-N passages for the field. Note these are sorted in
* the order that they appear in the document for convenience.
* @param content content for the field.
* @return formatted highlight. Note that for the
* non-expert APIs in {@link PostingsHighlighter} that
* return String, the toString method on the Object
* returned by this method is used to compute the string.
*/
public abstract Object format(Passage passages[], String content);
}
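To illustrate the subclassing contract described above, a minimal hedged sketch (not from the Lucene sources) of a formatter that brackets each match in plain text, skipping overlapping matches for brevity:

public class BracketPassageFormatter extends PassageFormatter {
  @Override
  public Object format(Passage[] passages, String content) {
    StringBuilder sb = new StringBuilder();
    for (Passage p : passages) {
      int pos = p.getStartOffset();
      for (int i = 0; i < p.getNumMatches(); i++) {
        int start = p.getMatchStarts()[i];
        int end = p.getMatchEnds()[i];
        if (start < pos) {
          continue;                                   // overlapping match; skipped for brevity
        }
        sb.append(content, pos, start).append('[')
          .append(content, start, end).append(']');
        pos = end;
      }
      if (pos < p.getEndOffset()) {
        sb.append(content, pos, p.getEndOffset());    // trailing, unhighlighted text
      }
      sb.append(" ... ");
    }
    return sb.toString();
  }
}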

View File

@@ -1,104 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.postingshighlight;
/**
* Ranks passages found by {@link PostingsHighlighter}.
* <p>
* Each passage is scored as a miniature document within the document.
* The final score is computed as {@link #norm} * &sum; ({@link #weight} * {@link #tf}).
* The default implementation is {@link #norm} * BM25.
* @lucene.experimental
*/
public class PassageScorer {
// TODO: this formula is completely made up. It might not provide relevant snippets!
/** BM25 k1 parameter, controls term frequency normalization */
final float k1;
/** BM25 b parameter, controls length normalization. */
final float b;
/** A pivot used for length normalization. */
final float pivot;
/**
* Creates PassageScorer with these default values:
* <ul>
* <li>{@code k1 = 1.2},
* <li>{@code b = 0.75}.
* <li>{@code pivot = 87}
* </ul>
*/
public PassageScorer() {
// 1.2 and 0.75 are well-known bm25 defaults (but maybe not the best here) ?
// 87 is typical average english sentence length.
this(1.2f, 0.75f, 87f);
}
/**
* Creates PassageScorer with specified scoring parameters
* @param k1 Controls non-linear term frequency normalization (saturation).
* @param b Controls to what degree passage length normalizes tf values.
* @param pivot Pivot value for length normalization (some rough idea of average sentence length in characters).
*/
public PassageScorer(float k1, float b, float pivot) {
this.k1 = k1;
this.b = b;
this.pivot = pivot;
}
/**
* Computes term importance, given its in-document statistics.
*
* @param contentLength length of document in characters
* @param totalTermFreq number of time term occurs in document
* @return term importance
*/
public float weight(int contentLength, int totalTermFreq) {
// approximate #docs from content length
float numDocs = 1 + contentLength / pivot;
// numDocs not numDocs - docFreq (ala DFR), since we approximate numDocs
return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D)/(totalTermFreq + 0.5D));
}
/**
* Computes term weight, given the frequency within the passage
* and the passage's length.
*
* @param freq number of occurrences of the term within this passage
* @param passageLen length of the passage in characters.
* @return term weight
*/
public float tf(int freq, int passageLen) {
float norm = k1 * ((1 - b) + b * (passageLen / pivot));
return freq / (freq + norm);
}
/**
* Normalize a passage according to its position in the document.
* <p>
* Typically passages towards the beginning of the document are
* more useful for summarizing the contents.
* <p>
* The default implementation is <code>1 + 1/log(pivot + passageStart)</code>
* @param passageStart start offset of the passage
* @return a boost value multiplied into the passage's score.
*/
public float norm(int passageStart) {
return 1 + 1/(float)Math.log(pivot + passageStart);
}
}
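A hedged numeric illustration of the formula above, using the default parameters (k1=1.2, b=0.75, pivot=87): for a single query term that occurs 3 times in a 5,000-character document and twice inside a 95-character passage starting at offset 200, the passage would be scored roughly like this:

PassageScorer scorer = new PassageScorer();        // defaults: k1=1.2, b=0.75, pivot=87
float weight = scorer.weight(5000, 3);             // term importance from document-level stats
float tf = scorer.tf(2, 95);                       // saturated within-passage frequency
float score = scorer.norm(200) * weight * tf;      // norm(passageStart) * sum(weight * tf)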

View File

@@ -1,820 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.postingshighlight;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* Simple highlighter that does not analyze fields nor use
* term vectors. Instead it requires
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
* <p>
* PostingsHighlighter treats the single original document as the whole corpus, and then scores individual
* passages as if they were documents in this corpus. It uses a {@link BreakIterator} to find
* passages in the text; by default it breaks using {@link BreakIterator#getSentenceInstance(Locale)
* getSentenceInstance(Locale.ROOT)}. It then iterates in parallel (merge sorting by offset) through
* the positions of all terms from the query, coalescing those hits that occur in a single passage
* into a {@link Passage}, and then scores each Passage using a separate {@link PassageScorer}.
* Passages are finally formatted into highlighted snippets with a {@link PassageFormatter}.
* <p>
* You can customize the behavior by subclassing this highlighter, some important hooks:
* <ul>
* <li>{@link #getBreakIterator(String)}: Customize how the text is divided into passages.
* <li>{@link #getScorer(String)}: Customize how passages are ranked.
* <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
* <li>{@link #getIndexAnalyzer(String)}: Enable highlighting of MultiTermQuerys such as {@code WildcardQuery}.
* </ul>
* <p>
* <b>WARNING</b>: The code is very new and probably still has some exciting bugs!
* <p>
* Example usage:
* <pre class="prettyprint">
* // configure field with offsets at index time
* FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
* offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
* Field body = new Field("body", "foobar", offsetsType);
*
* // retrieve highlights at query time
* PostingsHighlighter highlighter = new PostingsHighlighter();
* Query query = new TermQuery(new Term("body", "highlighting"));
* TopDocs topDocs = searcher.search(query, n);
* String highlights[] = highlighter.highlight("body", query, searcher, topDocs);
* </pre>
* <p>
* This is thread-safe, and can be used across different readers.
* @lucene.experimental
*/
public class PostingsHighlighter {
// TODO: maybe allow re-analysis for tiny fields? currently we require offsets,
// but if the analyzer is really fast and the field is tiny, this might really be
// unnecessary.
/** for rewriting: we don't want slow processing from MTQs */
private static final IndexSearcher EMPTY_INDEXSEARCHER;
static {
try {
IndexReader emptyReader = new MultiReader();
EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader);
EMPTY_INDEXSEARCHER.setQueryCache(null);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
/** Default maximum content size to process. Typically snippets
* closer to the beginning of the document better summarize its content */
public static final int DEFAULT_MAX_LENGTH = 10000;
private final int maxLength;
/** Set the first time {@link #getFormatter} is called,
* and then reused. */
private PassageFormatter defaultFormatter;
/** Set the first time {@link #getScorer} is called,
* and then reused. */
private PassageScorer defaultScorer;
/**
* Creates a new highlighter with {@link #DEFAULT_MAX_LENGTH}.
*/
public PostingsHighlighter() {
this(DEFAULT_MAX_LENGTH);
}
/**
* Creates a new highlighter, specifying maximum content length.
* @param maxLength maximum content size to process.
* @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
*/
public PostingsHighlighter(int maxLength) {
if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
// two reasons: no overflow problems in BreakIterator.preceding(offset+1),
// our sentinel in the offsets queue uses this value to terminate.
throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
}
this.maxLength = maxLength;
}
/** Returns the {@link BreakIterator} to use for
* dividing text into passages. This returns
* {@link BreakIterator#getSentenceInstance(Locale)} by default;
* subclasses can override to customize. */
protected BreakIterator getBreakIterator(String field) {
return BreakIterator.getSentenceInstance(Locale.ROOT);
}
/** Returns the {@link PassageFormatter} to use for
* formatting passages into highlighted snippets. This
* returns a new {@code PassageFormatter} by default;
* subclasses can override to customize. */
protected PassageFormatter getFormatter(String field) {
if (defaultFormatter == null) {
defaultFormatter = new DefaultPassageFormatter();
}
return defaultFormatter;
}
/** Returns the {@link PassageScorer} to use for
* ranking passages. This
* returns a new {@code PassageScorer} by default;
* subclasses can override to customize. */
protected PassageScorer getScorer(String field) {
if (defaultScorer == null) {
defaultScorer = new PassageScorer();
}
return defaultScorer;
}
/**
* Highlights the top passages from a single field.
*
* @param field field name to highlight.
* Must have a stored string value and also be indexed with offsets.
* @param query query to highlight.
* @param searcher searcher that was previously used to execute the query.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, the
* first sentence for the field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
return highlight(field, query, searcher, topDocs, 1);
}
/**
* Highlights the top-N passages from a single field.
*
* @param field field name to highlight.
* Must have a stored string value and also be indexed with offsets.
* @param query query to highlight.
* @param searcher searcher that was previously used to execute the query.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @param maxPassages The maximum number of top-N ranked passages used to
* form the highlighted snippets.
* @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, the
* first {@code maxPassages} sentences from the
* field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
Map<String,String[]> res = highlightFields(new String[] { field }, query, searcher, topDocs, new int[] { maxPassages });
return res.get(field);
}
/**
* Highlights the top passages from multiple fields.
* <p>
* Conceptually, this behaves as a more efficient form of:
* <pre class="prettyprint">
* Map m = new HashMap();
* for (String field : fields) {
* m.put(field, highlight(field, query, searcher, topDocs));
* }
* return m;
* </pre>
*
* @param fields field names to highlight.
* Must have a stored string value and also be indexed with offsets.
* @param query query to highlight.
* @param searcher searcher that was previously used to execute the query.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, the
* first sentence from the field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
int maxPassages[] = new int[fields.length];
Arrays.fill(maxPassages, 1);
return highlightFields(fields, query, searcher, topDocs, maxPassages);
}
/**
* Highlights the top-N passages from multiple fields.
* <p>
* Conceptually, this behaves as a more efficient form of:
* <pre class="prettyprint">
* Map m = new HashMap();
* for (String field : fields) {
* m.put(field, highlight(field, query, searcher, topDocs, maxPassages));
* }
* return m;
* </pre>
*
* @param fields field names to highlight.
* Must have a stored string value and also be indexed with offsets.
* @param query query to highlight.
* @param searcher searcher that was previously used to execute the query.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @param maxPassages The maximum number of top-N ranked passages per-field used to
* form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, the
* first {@code maxPassages} sentences from the
* field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages[]) throws IOException {
final ScoreDoc scoreDocs[] = topDocs.scoreDocs;
int docids[] = new int[scoreDocs.length];
for (int i = 0; i < docids.length; i++) {
docids[i] = scoreDocs[i].doc;
}
return highlightFields(fields, query, searcher, docids, maxPassages);
}
/**
* Highlights the top-N passages from multiple fields,
* for the provided int[] docids.
*
* @param fieldsIn field names to highlight.
* Must have a stored string value and also be indexed with offsets.
* @param query query to highlight.
* @param searcher searcher that was previously used to execute the query.
* @param docidsIn containing the document IDs to highlight.
* @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
* form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>docidsIn</code>.
* If no highlights were found for a document, the
first {@code maxPassages} sentences from the field will
* be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public Map<String,String[]> highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
Map<String,String[]> snippets = new HashMap<>();
for(Map.Entry<String,Object[]> ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) {
Object[] snippetObjects = ent.getValue();
String[] snippetStrings = new String[snippetObjects.length];
snippets.put(ent.getKey(), snippetStrings);
for(int i=0;i<snippetObjects.length;i++) {
Object snippet = snippetObjects[i];
if (snippet != null) {
snippetStrings[i] = snippet.toString();
}
}
}
return snippets;
}
/**
* Expert: highlights the top-N passages from multiple fields,
* for the provided int[] docids, to custom Object as
* returned by the {@link PassageFormatter}. Use
* this API to render to something other than String.
*
* @param fieldsIn field names to highlight.
* Must have a stored string value and also be indexed with offsets.
* @param query query to highlight.
* @param searcher searcher that was previously used to execute the query.
* @param docidsIn containing the document IDs to highlight.
* @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
* form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>docidsIn</code>.
* If no highlights were found for a document, the
first {@code maxPassages} sentences from the field will
* be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
protected Map<String,Object[]> highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
if (fieldsIn.length < 1) {
throw new IllegalArgumentException("fieldsIn must not be empty");
}
if (fieldsIn.length != maxPassagesIn.length) {
throw new IllegalArgumentException("invalid number of maxPassagesIn");
}
SortedSet<Term> queryTerms = new TreeSet<>();
EMPTY_INDEXSEARCHER.createNormalizedWeight(query, false).extractTerms(queryTerms);
IndexReaderContext readerContext = searcher.getIndexReader().getContext();
List<LeafReaderContext> leaves = readerContext.leaves();
// Make our own copies because we sort in-place:
int[] docids = new int[docidsIn.length];
System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length);
final String fields[] = new String[fieldsIn.length];
System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
final int maxPassages[] = new int[maxPassagesIn.length];
System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);
// sort for sequential io
Arrays.sort(docids);
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
String tmp = fields[i];
fields[i] = fields[j];
fields[j] = tmp;
int tmp2 = maxPassages[i];
maxPassages[i] = maxPassages[j];
maxPassages[j] = tmp2;
}
@Override
protected int compare(int i, int j) {
return fields[i].compareTo(fields[j]);
}
}.sort(0, fields.length);
// pull stored data:
String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);
Map<String,Object[]> highlights = new HashMap<>();
for (int i = 0; i < fields.length; i++) {
String field = fields[i];
int numPassages = maxPassages[i];
Term floor = new Term(field, "");
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
// TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
// Strip off the redundant field:
BytesRef terms[] = new BytesRef[fieldTerms.size()];
int termUpto = 0;
for(Term term : fieldTerms) {
terms[termUpto++] = term.bytes();
}
Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages, query);
Object[] result = new Object[docids.length];
for (int j = 0; j < docidsIn.length; j++) {
result[j] = fieldHighlights.get(docidsIn[j]);
}
highlights.put(field, result);
}
return highlights;
}
/** Loads the String values for each field X docID to be
* highlighted. By default this loads from stored
* fields, but a subclass can change the source. This
* method should allocate the String[fields.length][docids.length]
* and fill all values. The returned Strings must be
* identical to what was indexed. */
protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
String contents[][] = new String[fields.length][docids.length];
char valueSeparators[] = new char[fields.length];
for (int i = 0; i < fields.length; i++) {
valueSeparators[i] = getMultiValuedSeparator(fields[i]);
}
LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength);
for (int i = 0; i < docids.length; i++) {
searcher.doc(docids[i], visitor);
for (int j = 0; j < fields.length; j++) {
contents[j][i] = visitor.getValue(j).toString();
}
visitor.reset();
}
return contents;
}
/**
* Returns the logical separator between values for multi-valued fields.
* The default value is a space character, which means passages can span across values,
* but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)}
* if each value holds a discrete passage for highlighting.
*/
protected char getMultiValuedSeparator(String field) {
return ' ';
}
/**
* Returns the analyzer originally used to index the content for {@code field}.
* <p>
* This is used to highlight some MultiTermQueries.
* @return Analyzer or null (the default, meaning no special multi-term processing)
*/
protected Analyzer getIndexAnalyzer(String field) {
return null;
}
private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<LeafReaderContext> leaves, int maxPassages, Query query) throws IOException {
Map<Integer,Object> highlights = new HashMap<>();
PassageFormatter fieldFormatter = getFormatter(field);
if (fieldFormatter == null) {
throw new NullPointerException("PassageFormatter must not be null");
}
// check if we should do any multiterm processing
Analyzer analyzer = getIndexAnalyzer(field);
CharacterRunAutomaton automata[] = new CharacterRunAutomaton[0];
if (analyzer != null) {
automata = MultiTermHighlighting.extractAutomata(query, field);
}
// resize 'terms', where the last term is the multiterm matcher
if (automata.length > 0) {
BytesRef newTerms[] = new BytesRef[terms.length + 1];
System.arraycopy(terms, 0, newTerms, 0, terms.length);
terms = newTerms;
}
// we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes
// otherwise, we will just advance() existing enums to the new document in the same segment.
PostingsEnum postings[] = null;
TermsEnum termsEnum = null;
int lastLeaf = -1;
for (int i = 0; i < docids.length; i++) {
String content = contents[i];
if (content.length() == 0) {
continue; // nothing to do
}
bi.setText(content);
int doc = docids[i];
int leaf = ReaderUtil.subIndex(doc, leaves);
LeafReaderContext subContext = leaves.get(leaf);
LeafReader r = subContext.reader();
assert leaf >= lastLeaf; // increasing order
// if the segment has changed, we must initialize new enums.
if (leaf != lastLeaf) {
Terms t = r.terms(field);
if (t != null) {
if (!t.hasOffsets()) {
// no offsets available
throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
}
termsEnum = t.iterator();
postings = new PostingsEnum[terms.length];
} else {
termsEnum = null;
}
}
if (termsEnum == null) {
continue; // no terms for this field, nothing to do
}
// if there are multi-term matches, we have to initialize the "fake" enum for each document
if (automata.length > 0) {
PostingsEnum dp = MultiTermHighlighting.getDocsEnum(analyzer.tokenStream(field, content), automata);
dp.advance(doc - subContext.docBase);
postings[terms.length-1] = dp; // last term is the multiterm matcher
}
Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
if (passages.length == 0) {
// no passages were returned, so ask for a default summary
passages = getEmptyHighlight(field, bi, maxPassages);
}
if (passages.length > 0) {
highlights.put(doc, fieldFormatter.format(passages, content));
}
lastLeaf = leaf;
}
return highlights;
}
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset),
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc,
TermsEnum termsEnum, PostingsEnum[] postings, int n) throws IOException {
PassageScorer scorer = getScorer(field);
if (scorer == null) {
throw new NullPointerException("PassageScorer must not be null");
}
PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>();
float weights[] = new float[terms.length];
// initialize postings
for (int i = 0; i < terms.length; i++) {
PostingsEnum de = postings[i];
int pDoc;
if (de == EMPTY) {
continue;
} else if (de == null) {
postings[i] = EMPTY; // initially
if (!termsEnum.seekExact(terms[i])) {
continue; // term not found
}
de = postings[i] = termsEnum.postings(null, PostingsEnum.OFFSETS);
assert de != null;
pDoc = de.advance(doc);
} else {
pDoc = de.docID();
if (pDoc < doc) {
pDoc = de.advance(doc);
}
}
if (doc == pDoc) {
weights[i] = scorer.weight(contentLength, de.freq());
de.nextPosition();
pq.add(new OffsetsEnum(de, i));
}
}
pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination
PriorityQueue<Passage> passageQueue = new PriorityQueue<>(n, new Comparator<Passage>() {
@Override
public int compare(Passage left, Passage right) {
if (left.score < right.score) {
return -1;
} else if (left.score > right.score) {
return 1;
} else {
return left.startOffset - right.startOffset;
}
}
});
Passage current = new Passage();
OffsetsEnum off;
while ((off = pq.poll()) != null) {
final PostingsEnum dp = off.dp;
int start = dp.startOffset();
assert start >= 0;
int end = dp.endOffset();
// LUCENE-5166: this hit would span the content limit... however more valid
// hits may exist (they are sorted by start). so we pretend like we never
// saw this term, it won't cause a passage to be added to passageQueue or anything.
assert EMPTY.startOffset() == Integer.MAX_VALUE;
if (start < contentLength && end > contentLength) {
continue;
}
if (start >= current.endOffset) {
if (current.startOffset >= 0) {
// finalize current
current.score *= scorer.norm(current.startOffset);
// new sentence: first add 'current' to queue
if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
current.reset(); // can't compete, just reset it
} else {
passageQueue.offer(current);
if (passageQueue.size() > n) {
current = passageQueue.poll();
current.reset();
} else {
current = new Passage();
}
}
}
// if we exceed limit, we are done
if (start >= contentLength) {
Passage passages[] = new Passage[passageQueue.size()];
passageQueue.toArray(passages);
for (Passage p : passages) {
p.sort();
}
// sort in ascending order
Arrays.sort(passages, new Comparator<Passage>() {
@Override
public int compare(Passage left, Passage right) {
return left.startOffset - right.startOffset;
}
});
return passages;
}
// advance breakiterator
assert BreakIterator.DONE < 0;
current.startOffset = Math.max(bi.preceding(start+1), 0);
current.endOffset = Math.min(bi.next(), contentLength);
}
int tf = 0;
while (true) {
tf++;
BytesRef term = terms[off.id];
if (term == null) {
// multitermquery match, pull from payload
term = off.dp.getPayload();
assert term != null;
}
current.addMatch(start, end, term);
if (off.pos == dp.freq()) {
break; // removed from pq
} else {
off.pos++;
dp.nextPosition();
start = dp.startOffset();
end = dp.endOffset();
}
if (start >= current.endOffset || end > contentLength) {
pq.offer(off);
break;
}
}
current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
}
// Dead code but compiler disagrees:
assert false;
return null;
}
/** Called to summarize a document when no hits were
* found. By default this just returns the first
* {@code maxPassages} sentences; subclasses can override
* to customize. */
protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
// BreakIterator should be un-next'd:
List<Passage> passages = new ArrayList<>();
int pos = bi.current();
assert pos == 0;
while (passages.size() < maxPassages) {
int next = bi.next();
if (next == BreakIterator.DONE) {
break;
}
Passage passage = new Passage();
passage.score = Float.NaN;
passage.startOffset = pos;
passage.endOffset = next;
passages.add(passage);
pos = next;
}
return passages.toArray(new Passage[passages.size()]);
}
private static class OffsetsEnum implements Comparable<OffsetsEnum> {
PostingsEnum dp;
int pos;
int id;
OffsetsEnum(PostingsEnum dp, int id) throws IOException {
this.dp = dp;
this.id = id;
this.pos = 1;
}
@Override
public int compareTo(OffsetsEnum other) {
try {
int off = dp.startOffset();
int otherOff = other.dp.startOffset();
if (off == otherOff) {
return id - other.id;
} else {
return Integer.compare(off, otherOff);
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
private static final PostingsEnum EMPTY = new PostingsEnum() {
@Override
public int nextPosition() throws IOException { return -1; }
@Override
public int startOffset() throws IOException { return Integer.MAX_VALUE; }
@Override
public int endOffset() throws IOException { return Integer.MAX_VALUE; }
@Override
public BytesRef getPayload() throws IOException { return null; }
@Override
public int freq() throws IOException { return 0; }
@Override
public int docID() { return NO_MORE_DOCS; }
@Override
public int nextDoc() throws IOException { return NO_MORE_DOCS; }
@Override
public int advance(int target) throws IOException { return NO_MORE_DOCS; }
@Override
public long cost() { return 0; }
};
private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
private final String fields[];
private final char valueSeparators[];
private final int maxLength;
private final StringBuilder builders[];
private int currentField = -1;
public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) {
assert fields.length == valueSeparators.length;
this.fields = fields;
this.valueSeparators = valueSeparators;
this.maxLength = maxLength;
builders = new StringBuilder[fields.length];
for (int i = 0; i < builders.length; i++) {
builders[i] = new StringBuilder();
}
}
@Override
public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
String value = new String(bytes, StandardCharsets.UTF_8);
assert currentField >= 0;
StringBuilder builder = builders[currentField];
if (builder.length() > 0 && builder.length() < maxLength) {
builder.append(valueSeparators[currentField]);
}
if (builder.length() + value.length() > maxLength) {
builder.append(value, 0, maxLength - builder.length());
} else {
builder.append(value);
}
}
@Override
public Status needsField(FieldInfo fieldInfo) throws IOException {
currentField = Arrays.binarySearch(fields, fieldInfo.name);
if (currentField < 0) {
return Status.NO;
} else if (builders[currentField].length() > maxLength) {
return fields.length == 1 ? Status.STOP : Status.NO;
}
return Status.YES;
}
String getValue(int i) {
return builders[i].toString();
}
void reset() {
currentField = -1;
for (int i = 0; i < fields.length; i++) {
builders[i].setLength(0);
}
}
}
}

View File

@@ -1,21 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Highlighter implementation that uses offsets from postings lists.
*/
package org.apache.lucene.search.postingshighlight;

View File

@@ -0,0 +1,150 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.text.BreakIterator;
import java.text.CharacterIterator;
/**
* A {@link BreakIterator} that breaks the text whenever a certain separator, provided as a constructor argument, is found.
*/
public final class CustomSeparatorBreakIterator extends BreakIterator {
private final char separator;
private CharacterIterator text;
private int current;
public CustomSeparatorBreakIterator(char separator) {
this.separator = separator;
}
@Override
public int current() {
return current;
}
@Override
public int first() {
text.setIndex(text.getBeginIndex());
return current = text.getIndex();
}
@Override
public int last() {
text.setIndex(text.getEndIndex());
return current = text.getIndex();
}
@Override
public int next() {
if (text.getIndex() == text.getEndIndex()) {
return DONE;
} else {
return advanceForward();
}
}
private int advanceForward() {
char c;
while ((c = text.next()) != CharacterIterator.DONE) {
if (c == separator) {
return current = text.getIndex() + 1;
}
}
assert text.getIndex() == text.getEndIndex();
return current = text.getIndex();
}
@Override
public int following(int pos) {
if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
throw new IllegalArgumentException("offset out of bounds");
} else if (pos == text.getEndIndex()) {
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
// https://bugs.openjdk.java.net/browse/JDK-8015110
text.setIndex(text.getEndIndex());
current = text.getIndex();
return DONE;
} else {
text.setIndex(pos);
current = text.getIndex();
return advanceForward();
}
}
@Override
public int previous() {
if (text.getIndex() == text.getBeginIndex()) {
return DONE;
} else {
return advanceBackward();
}
}
private int advanceBackward() {
char c;
while ((c = text.previous()) != CharacterIterator.DONE) {
if (c == separator) {
return current = text.getIndex() + 1;
}
}
assert text.getIndex() == text.getBeginIndex();
return current = text.getIndex();
}
@Override
public int preceding(int pos) {
if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
throw new IllegalArgumentException("offset out of bounds");
} else if (pos == text.getBeginIndex()) {
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
// https://bugs.openjdk.java.net/browse/JDK-8015110
text.setIndex(text.getBeginIndex());
current = text.getIndex();
return DONE;
} else {
text.setIndex(pos);
current = text.getIndex();
return advanceBackward();
}
}
@Override
public int next(int n) {
if (n < 0) {
for (int i = 0; i < -n; i++) {
previous();
}
} else {
for (int i = 0; i < n; i++) {
next();
}
}
return current();
}
@Override
public CharacterIterator getText() {
return text;
}
@Override
public void setText(CharacterIterator newText) {
text = newText;
current = text.getBeginIndex();
}
}
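For context, a minimal usage sketch (not part of this commit) of how the relocated CustomSeparatorBreakIterator above can feed the UnifiedHighlighter via its protected getBreakIterator hook. The "body" field, the pre-built searcher/analyzer, and the '|' separator are assumptions for illustration only.

import java.text.BreakIterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

public class CustomSeparatorHighlightExample {
  // Highlights the assumed "body" field, breaking passages on '|' instead of sentences.
  static String[] highlightBody(IndexSearcher searcher, Analyzer analyzer,
                                Query query, TopDocs hits) throws java.io.IOException {
    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer) {
      @Override
      protected BreakIterator getBreakIterator(String field) {
        return new CustomSeparatorBreakIterator('|'); // '|' is an arbitrary example separator
      }
    };
    return highlighter.highlight("body", query, hits);
  }
}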

View File

@ -0,0 +1,116 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.text.BreakIterator;
import java.text.CharacterIterator;
/** Just produces one single fragment for the entire text */
public final class WholeBreakIterator extends BreakIterator {
private CharacterIterator text;
private int start;
private int end;
private int current;
@Override
public int current() {
return current;
}
@Override
public int first() {
return (current = start);
}
@Override
public int following(int pos) {
if (pos < start || pos > end) {
throw new IllegalArgumentException("offset out of bounds");
} else if (pos == end) {
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
// https://bugs.openjdk.java.net/browse/JDK-8015110
current = end;
return DONE;
} else {
return last();
}
}
@Override
public CharacterIterator getText() {
return text;
}
@Override
public int last() {
return (current = end);
}
@Override
public int next() {
if (current == end) {
return DONE;
} else {
return last();
}
}
@Override
public int next(int n) {
if (n < 0) {
for (int i = 0; i < -n; i++) {
previous();
}
} else {
for (int i = 0; i < n; i++) {
next();
}
}
return current();
}
@Override
public int preceding(int pos) {
if (pos < start || pos > end) {
throw new IllegalArgumentException("offset out of bounds");
} else if (pos == start) {
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
// https://bugs.openjdk.java.net/browse/JDK-8015110
current = start;
return DONE;
} else {
return first();
}
}
@Override
public int previous() {
if (current == start) {
return DONE;
} else {
return first();
}
}
@Override
public void setText(CharacterIterator newText) {
start = newText.getBeginIndex();
end = newText.getEndIndex();
text = newText;
current = start;
}
}
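A quick sketch of the contract WholeBreakIterator implements: the entire text is treated as one passage, so the only boundaries are the begin and end indices. The sample string below is an illustrative assumption.

import java.text.BreakIterator;
import java.text.StringCharacterIterator;
import org.apache.lucene.search.uhighlight.WholeBreakIterator;

public class WholeBreakIteratorExample {
  public static void main(String[] args) {
    BreakIterator bi = new WholeBreakIterator();
    bi.setText(new StringCharacterIterator("First sentence. Second sentence."));
    System.out.println(bi.first());  // 0: start of the single fragment
    System.out.println(bi.next());   // 32: end of the text, i.e. one whole fragment
    System.out.println(bi.next());   // -1 (BreakIterator.DONE): no further boundaries
  }
}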

File diff suppressed because one or more lines are too long

View File

@ -1,884 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.postingshighlight;
import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
/**
* Some tests that override {@link PostingsHighlighter#getIndexAnalyzer} to
* highlight wildcard, fuzzy, etc. queries.
*/
public class TestMultiTermHighlighting extends LuceneTestCase {
public void testWildcards() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
Query query = new WildcardQuery(new Term("body", "te*"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// wrong field
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
bq.add(new WildcardQuery(new Term("bogus", "te*")), BooleanClause.Occur.SHOULD);
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testOnePrefix() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
Query query = new PrefixQuery(new Term("body", "te"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// wrong field
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
bq.add(new PrefixQuery(new Term("bogus", "te")), BooleanClause.Occur.SHOULD);
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testOneRegexp() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
Query query = new RegexpQuery(new Term("body", "te.*"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// wrong field
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
bq.add(new RegexpQuery(new Term("bogus", "te.*")), BooleanClause.Occur.SHOULD);
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testOneFuzzy() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
Query query = new FuzzyQuery(new Term("body", "tets"), 1);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// with prefix
query = new FuzzyQuery(new Term("body", "tets"), 1, 2);
topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// wrong field
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
bq.add(new FuzzyQuery(new Term("bogus", "tets"), 1), BooleanClause.Occur.SHOULD);
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testRanges() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
Query query = TermRangeQuery.newStringRange("body", "ta", "tf", true, true);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// null start
query = TermRangeQuery.newStringRange("body", null, "tf", true, true);
topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This <b>is</b> <b>a</b> <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> <b>a</b> <b>one</b> <b>sentence</b> <b>document</b>.", snippets[1]);
// null end
query = TermRangeQuery.newStringRange("body", "ta", null, true, true);
topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("<b>This</b> is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// exact start inclusive
query = TermRangeQuery.newStringRange("body", "test", "tf", true, true);
topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// exact end inclusive
query = TermRangeQuery.newStringRange("body", "ta", "test", true, true);
topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// exact start exclusive
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
bq.add(TermRangeQuery.newStringRange("body", "test", "tf", false, true), BooleanClause.Occur.SHOULD);
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
// exact end exclusive
bq = new BooleanQuery.Builder();
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
bq.add(TermRangeQuery.newStringRange("body", "ta", "test", true, false), BooleanClause.Occur.SHOULD);
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
// wrong field
bq = new BooleanQuery.Builder();
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
bq.add(TermRangeQuery.newStringRange("bogus", "ta", "tf", true, true), BooleanClause.Occur.SHOULD);
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testWildcardInBoolean() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.SHOULD);
TopDocs topDocs = searcher.search(query.build(), 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query.build(), searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// must not
query = new BooleanQuery.Builder();
query.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
query.add(new WildcardQuery(new Term("bogus", "te*")), BooleanClause.Occur.MUST_NOT);
topDocs = searcher.search(query.build(), 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
snippets = highlighter.highlight("body", query.build(), searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testWildcardInFiltered() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
Query query = new BooleanQuery.Builder()
.add(new WildcardQuery(new Term("body", "te*")), Occur.MUST)
.add(new TermQuery(new Term("body", "test")), Occur.FILTER)
.build();
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testWildcardInConstantScore() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
ConstantScoreQuery query = new ConstantScoreQuery(new WildcardQuery(new Term("body", "te*")));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testWildcardInDisjunctionMax() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
DisjunctionMaxQuery query = new DisjunctionMaxQuery(
Collections.singleton(new WildcardQuery(new Term("body", "te*"))), 0);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testSpanWildcard() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
Query query = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testSpanOr() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
SpanQuery childQuery = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
Query query = new SpanOrQuery(new SpanQuery[] { childQuery });
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testSpanNear() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
SpanQuery childQuery = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
Query query = new SpanNearQuery(new SpanQuery[] { childQuery, childQuery }, 0, false);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testSpanNot() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
SpanQuery include = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
SpanQuery exclude = new SpanTermQuery(new Term("body", "bogus"));
Query query = new SpanNotQuery(include, exclude);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
public void testSpanPositionCheck() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test.");
iw.addDocument(doc);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
SpanQuery childQuery = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
Query query = new SpanFirstQuery(childQuery, 1000000);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.close();
dir.close();
}
/** Runs a query with two MTQs and confirms the formatter
* can tell which query matched which hit. */
public void testWhichMTQMatched() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
};
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.SHOULD);
query.add(new WildcardQuery(new Term("body", "one")), BooleanClause.Occur.SHOULD);
query.add(new WildcardQuery(new Term("body", "se*")), BooleanClause.Occur.SHOULD);
TopDocs topDocs = searcher.search(query.build(), 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query.build(), searcher, topDocs);
assertEquals(1, snippets.length);
// Default formatter just bolds each hit:
assertEquals("<b>Test</b> a <b>one</b> <b>sentence</b> document.", snippets[0]);
// Now use our own formatter, that also stuffs the
// matching term's text into the result:
highlighter = new PostingsHighlighter() {
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
@Override
protected PassageFormatter getFormatter(String field) {
return new PassageFormatter() {
@Override
public Object format(Passage passages[], String content) {
// Copied from DefaultPassageFormatter, but
// tweaked to include the matched term:
StringBuilder sb = new StringBuilder();
int pos = 0;
for (Passage passage : passages) {
// don't add ellipsis if it's the first one, or if it's connected.
if (passage.startOffset > pos && pos > 0) {
sb.append("... ");
}
pos = passage.startOffset;
for (int i = 0; i < passage.numMatches; i++) {
int start = passage.matchStarts[i];
int end = passage.matchEnds[i];
// it's possible to have overlapping terms
if (start > pos) {
sb.append(content, pos, start);
}
if (end > pos) {
sb.append("<b>");
sb.append(content, Math.max(pos, start), end);
sb.append('(');
sb.append(passage.getMatchTerms()[i].utf8ToString());
sb.append(')');
sb.append("</b>");
pos = end;
}
}
// it's possible a "term" from the analyzer could span a sentence boundary.
sb.append(content, pos, Math.max(pos, passage.endOffset));
pos = passage.endOffset;
}
return sb.toString();
}
};
}
};
assertEquals(1, topDocs.totalHits);
snippets = highlighter.highlight("body", query.build(), searcher, topDocs);
assertEquals(1, snippets.length);
// Default formatter bolds each hit:
assertEquals("<b>Test(body:te*)</b> a <b>one(body:one)</b> <b>sentence(body:se*)</b> document.", snippets[0]);
ir.close();
dir.close();
}
}
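The removed tests above exercised wildcard, prefix, fuzzy, and span queries by subclassing PostingsHighlighter to supply the index analyzer. A minimal sketch of the equivalent with the UnifiedHighlighter, which takes the analyzer in its constructor, is shown below; the "body" field (stored and indexed with offsets) and the pre-built searcher/analyzer are assumptions carried over from those tests.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

public class MultiTermHighlightExample {
  static String[] highlightWildcard(IndexSearcher searcher, Analyzer analyzer) throws Exception {
    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
    Query query = new WildcardQuery(new Term("body", "te*"));
    TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
    // Multi-term queries are highlighted without overriding getIndexAnalyzer.
    return highlighter.highlight("body", query, topDocs);
  }
}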

View File

@ -1,324 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.postingshighlight;
import java.io.IOException;
import java.util.HashSet;
import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestPostingsHighlighterRanking extends LuceneTestCase {
/**
* indexes a bunch of gibberish, and then highlights top(n).
* asserts that the top(n) highlights are a subset of the top(n+1) highlights, up to some max N
*/
// TODO: this only tests single-valued fields. we should also index multiple values per field!
public void testRanking() throws Exception {
// number of documents: we will check each one
final int numDocs = atLeast(100);
// number of top-N snippets, we will check 1 .. N
final int maxTopN = 5;
// maximum number of elements to put in a sentence.
final int maxSentenceLength = 10;
// maximum number of sentences in a document
final int maxNumSentences = 20;
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
Document document = new Document();
Field id = new StringField("id", "", Field.Store.NO);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
document.add(id);
document.add(body);
for (int i = 0; i < numDocs; i++) {
StringBuilder bodyText = new StringBuilder();
int numSentences = TestUtil.nextInt(random(), 1, maxNumSentences);
for (int j = 0; j < numSentences; j++) {
bodyText.append(newSentence(random(), maxSentenceLength));
}
body.setStringValue(bodyText.toString());
id.setStringValue(Integer.toString(i));
iw.addDocument(document);
}
IndexReader ir = iw.getReader();
IndexSearcher searcher = newSearcher(ir);
for (int i = 0; i < numDocs; i++) {
checkDocument(searcher, i, maxTopN);
}
iw.close();
ir.close();
dir.close();
}
private void checkDocument(IndexSearcher is, int doc, int maxTopN) throws IOException {
for (int ch = 'a'; ch <= 'z'; ch++) {
Term term = new Term("body", "" + (char)ch);
// check a simple term query
checkQuery(is, new TermQuery(term), doc, maxTopN);
// check a boolean query
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
Term nextTerm = new Term("body", "" + (char)(ch+1));
bq.add(new TermQuery(nextTerm), BooleanClause.Occur.SHOULD);
checkQuery(is, bq.build(), doc, maxTopN);
}
}
private void checkQuery(IndexSearcher is, Query query, int doc, int maxTopN) throws IOException {
for (int n = 1; n < maxTopN; n++) {
final FakePassageFormatter f1 = new FakePassageFormatter();
PostingsHighlighter p1 = new PostingsHighlighter(Integer.MAX_VALUE-1) {
@Override
protected PassageFormatter getFormatter(String field) {
assertEquals("body", field);
return f1;
}
};
final FakePassageFormatter f2 = new FakePassageFormatter();
PostingsHighlighter p2 = new PostingsHighlighter(Integer.MAX_VALUE-1) {
@Override
protected PassageFormatter getFormatter(String field) {
assertEquals("body", field);
return f2;
}
};
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(query, BooleanClause.Occur.MUST);
bq.add(new TermQuery(new Term("id", Integer.toString(doc))), BooleanClause.Occur.MUST);
TopDocs td = is.search(bq.build(), 1);
p1.highlight("body", bq.build(), is, td, n);
p2.highlight("body", bq.build(), is, td, n+1);
assertTrue(f2.seen.containsAll(f1.seen));
}
}
/**
* returns a new random sentence, up to maxSentenceLength "words" in length.
* each word is a single character (a-z). The first one is capitalized.
*/
private String newSentence(Random r, int maxSentenceLength) {
StringBuilder sb = new StringBuilder();
int numElements = TestUtil.nextInt(r, 1, maxSentenceLength);
for (int i = 0; i < numElements; i++) {
if (sb.length() > 0) {
sb.append(' ');
sb.append((char) TestUtil.nextInt(r, 'a', 'z'));
} else {
// capitalize the first word to help breakiterator
sb.append((char) TestUtil.nextInt(r, 'A', 'Z'));
}
}
sb.append(". "); // finalize sentence
return sb.toString();
}
/**
* a fake formatter that doesn't actually format passages.
* instead it just collects them for asserts!
*/
static class FakePassageFormatter extends PassageFormatter {
HashSet<Pair> seen = new HashSet<>();
@Override
public String format(Passage passages[], String content) {
for (Passage p : passages) {
// verify some basics about the passage
assertTrue(p.getScore() >= 0);
assertTrue(p.getNumMatches() > 0);
assertTrue(p.getStartOffset() >= 0);
assertTrue(p.getStartOffset() <= content.length());
assertTrue(p.getEndOffset() >= p.getStartOffset());
assertTrue(p.getEndOffset() <= content.length());
// we use a very simple analyzer. so we can assert the matches are correct
int lastMatchStart = -1;
for (int i = 0; i < p.getNumMatches(); i++) {
BytesRef term = p.getMatchTerms()[i];
int matchStart = p.getMatchStarts()[i];
assertTrue(matchStart >= 0);
// must at least start within the passage
assertTrue(matchStart < p.getEndOffset());
int matchEnd = p.getMatchEnds()[i];
assertTrue(matchEnd >= 0);
// always moving forward
assertTrue(matchStart >= lastMatchStart);
lastMatchStart = matchStart;
// single character terms
assertEquals(matchStart+1, matchEnd);
// and the offsets must be correct...
assertEquals(1, term.length);
assertEquals((char)term.bytes[term.offset], Character.toLowerCase(content.charAt(matchStart)));
}
// record just the start/end offset for simplicity
seen.add(new Pair(p.getStartOffset(), p.getEndOffset()));
}
return "bogus!!!!!!";
}
}
static class Pair {
final int start;
final int end;
Pair(int start, int end) {
this.start = start;
this.end = end;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + end;
result = prime * result + start;
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
Pair other = (Pair) obj;
if (end != other.end) {
return false;
}
if (start != other.start) {
return false;
}
return true;
}
@Override
public String toString() {
return "Pair [start=" + start + ", end=" + end + "]";
}
}
/** sets b=0 to disable passage length normalization */
public void testCustomB() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test. This test is a better test but the sentence is excruiatingly long, " +
"you have no idea how painful it was for me to type this long sentence into my IDE.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000) {
@Override
protected PassageScorer getScorer(String field) {
return new PassageScorer(1.2f, 0, 87);
}
};
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1);
assertEquals(1, snippets.length);
assertTrue(snippets[0].startsWith("This <b>test</b> is a better <b>test</b>"));
ir.close();
dir.close();
}
/** sets k1=0 for simple coordinate-level match (# of query terms present) */
public void testCustomK1() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This has only foo foo. " +
"On the other hand this sentence contains both foo and bar. " +
"This has only bar bar bar bar bar bar bar bar bar bar bar bar.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000) {
@Override
protected PassageScorer getScorer(String field) {
return new PassageScorer(0, 0.75f, 87);
}
};
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD);
query.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD);
TopDocs topDocs = searcher.search(query.build(), 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query.build(), searcher, topDocs, 1);
assertEquals(1, snippets.length);
assertTrue(snippets[0].startsWith("On the other hand"));
ir.close();
dir.close();
}
}
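The testCustomB and testCustomK1 cases above tuned the passage scorer by overriding getScorer on the PostingsHighlighter. A sketch of the analogous customization is shown below, assuming the UnifiedHighlighter exposes the same protected getScorer hook and using the k1/b/pivot values from the removed b=0 test; the searcher and analyzer are assumed to be pre-built.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

public class CustomScorerExample {
  // Returns a highlighter with passage-length normalization disabled (b = 0).
  static UnifiedHighlighter noLengthNormHighlighter(IndexSearcher searcher, Analyzer analyzer) {
    return new UnifiedHighlighter(searcher, analyzer) {
      @Override
      protected PassageScorer getScorer(String field) {
        return new PassageScorer(1.2f, 0f, 87f); // k1, b, pivot; b = 0 disables length normalization
      }
    };
  }
}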

View File

@ -23,7 +23,6 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.QueryBuilder;

View File

@ -0,0 +1,114 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.text.BreakIterator;
import java.util.Locale;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.util.LuceneTestCase;
import static org.apache.lucene.search.uhighlight.TestWholeBreakIterator.assertSameBreaks;
import static org.hamcrest.CoreMatchers.equalTo;
public class TestCustomSeparatorBreakIterator extends LuceneTestCase {
private static final Character[] SEPARATORS = new Character[]{' ', '\u0000', 8233};
public void testBreakOnCustomSeparator() throws Exception {
Character separator = randomSeparator();
BreakIterator bi = new CustomSeparatorBreakIterator(separator);
String source = "this" + separator + "is" + separator + "the" + separator + "first" + separator + "sentence";
bi.setText(source);
assertThat(bi.current(), equalTo(0));
assertThat(bi.first(), equalTo(0));
assertThat(source.substring(bi.current(), bi.next()), equalTo("this" + separator));
assertThat(source.substring(bi.current(), bi.next()), equalTo("is" + separator));
assertThat(source.substring(bi.current(), bi.next()), equalTo("the" + separator));
assertThat(source.substring(bi.current(), bi.next()), equalTo("first" + separator));
assertThat(source.substring(bi.current(), bi.next()), equalTo("sentence"));
assertThat(bi.next(), equalTo(BreakIterator.DONE));
assertThat(bi.last(), equalTo(source.length()));
int current = bi.current();
assertThat(source.substring(bi.previous(), current), equalTo("sentence"));
current = bi.current();
assertThat(source.substring(bi.previous(), current), equalTo("first" + separator));
current = bi.current();
assertThat(source.substring(bi.previous(), current), equalTo("the" + separator));
current = bi.current();
assertThat(source.substring(bi.previous(), current), equalTo("is" + separator));
current = bi.current();
assertThat(source.substring(bi.previous(), current), equalTo("this" + separator));
assertThat(bi.previous(), equalTo(BreakIterator.DONE));
assertThat(bi.current(), equalTo(0));
assertThat(source.substring(0, bi.following(9)), equalTo("this" + separator + "is" + separator + "the" + separator));
assertThat(source.substring(0, bi.preceding(9)), equalTo("this" + separator + "is" + separator));
assertThat(bi.first(), equalTo(0));
assertThat(source.substring(0, bi.next(3)), equalTo("this" + separator + "is" + separator + "the" + separator));
}
public void testSingleSentences() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
assertSameBreaks("a", expected, actual);
assertSameBreaks("ab", expected, actual);
assertSameBreaks("abc", expected, actual);
assertSameBreaks("", expected, actual);
}
public void testSliceEnd() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
assertSameBreaks("a000", 0, 1, expected, actual);
assertSameBreaks("ab000", 0, 1, expected, actual);
assertSameBreaks("abc000", 0, 1, expected, actual);
assertSameBreaks("000", 0, 0, expected, actual);
}
public void testSliceStart() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
assertSameBreaks("000a", 3, 1, expected, actual);
assertSameBreaks("000ab", 3, 2, expected, actual);
assertSameBreaks("000abc", 3, 3, expected, actual);
assertSameBreaks("000", 3, 0, expected, actual);
}
public void testSliceMiddle() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
assertSameBreaks("000a000", 3, 1, expected, actual);
assertSameBreaks("000ab000", 3, 2, expected, actual);
assertSameBreaks("000abc000", 3, 3, expected, actual);
assertSameBreaks("000000", 3, 0, expected, actual);
}
/** the current position must be ignored, initial position is always first() */
public void testFirstPosition() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
assertSameBreaks("000ab000", 3, 2, 4, expected, actual);
}
private static char randomSeparator() {
return RandomPicks.randomFrom(random(), SEPARATORS);
}
}

View File

@ -49,7 +49,6 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;

View File

@ -0,0 +1,134 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Locale;
import org.apache.lucene.util.LuceneTestCase;
public class TestWholeBreakIterator extends LuceneTestCase {
/** For single sentences, we know WholeBreakIterator should break the same as a sentence iterator */
public void testSingleSentences() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new WholeBreakIterator();
assertSameBreaks("a", expected, actual);
assertSameBreaks("ab", expected, actual);
assertSameBreaks("abc", expected, actual);
assertSameBreaks("", expected, actual);
}
public void testSliceEnd() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new WholeBreakIterator();
assertSameBreaks("a000", 0, 1, expected, actual);
assertSameBreaks("ab000", 0, 1, expected, actual);
assertSameBreaks("abc000", 0, 1, expected, actual);
assertSameBreaks("000", 0, 0, expected, actual);
}
public void testSliceStart() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new WholeBreakIterator();
assertSameBreaks("000a", 3, 1, expected, actual);
assertSameBreaks("000ab", 3, 2, expected, actual);
assertSameBreaks("000abc", 3, 3, expected, actual);
assertSameBreaks("000", 3, 0, expected, actual);
}
public void testSliceMiddle() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new WholeBreakIterator();
assertSameBreaks("000a000", 3, 1, expected, actual);
assertSameBreaks("000ab000", 3, 2, expected, actual);
assertSameBreaks("000abc000", 3, 3, expected, actual);
assertSameBreaks("000000", 3, 0, expected, actual);
}
  /** The current position must be ignored; the initial position is always first(). */
public void testFirstPosition() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new WholeBreakIterator();
assertSameBreaks("000ab000", 3, 2, 4, expected, actual);
}
public static void assertSameBreaks(String text, BreakIterator expected, BreakIterator actual) {
assertSameBreaks(new StringCharacterIterator(text),
new StringCharacterIterator(text),
expected,
actual);
}
public static void assertSameBreaks(String text, int offset, int length, BreakIterator expected, BreakIterator actual) {
assertSameBreaks(text, offset, length, offset, expected, actual);
}
public static void assertSameBreaks(String text, int offset, int length, int current, BreakIterator expected, BreakIterator actual) {
assertSameBreaks(new StringCharacterIterator(text, offset, offset+length, current),
new StringCharacterIterator(text, offset, offset+length, current),
expected,
actual);
}
  /** Asserts that two BreakIterators break the text the same way. */
public static void assertSameBreaks(CharacterIterator one, CharacterIterator two, BreakIterator expected, BreakIterator actual) {
expected.setText(one);
actual.setText(two);
assertEquals(expected.current(), actual.current());
// next()
int v = expected.current();
while (v != BreakIterator.DONE) {
assertEquals(v = expected.next(), actual.next());
assertEquals(expected.current(), actual.current());
}
// first()
assertEquals(expected.first(), actual.first());
assertEquals(expected.current(), actual.current());
// last()
assertEquals(expected.last(), actual.last());
assertEquals(expected.current(), actual.current());
// previous()
v = expected.current();
while (v != BreakIterator.DONE) {
assertEquals(v = expected.previous(), actual.previous());
assertEquals(expected.current(), actual.current());
}
// following()
for (int i = one.getBeginIndex(); i <= one.getEndIndex(); i++) {
expected.first();
actual.first();
assertEquals(expected.following(i), actual.following(i));
assertEquals(expected.current(), actual.current());
}
// preceding()
for (int i = one.getBeginIndex(); i <= one.getEndIndex(); i++) {
expected.last();
actual.last();
assertEquals(expected.preceding(i), actual.preceding(i));
assertEquals(expected.current(), actual.current());
}
}
}

View File

@ -28,13 +28,13 @@ import java.util.function.Predicate;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.uhighlight.DefaultPassageFormatter;
import org.apache.lucene.search.uhighlight.LengthGoalBreakIterator;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.search.uhighlight.WholeBreakIterator;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
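
As the import changes above show, downstream code only needs to switch the package of these break iterators from postingshighlight to uhighlight. A minimal sketch of the migration with UnifiedHighlighter, assuming the caller already has a searcher, analyzer, query, and hits; the "body" field name is a placeholder, not something this commit defines.

import java.io.IOException;
import java.text.BreakIterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
// Old location: org.apache.lucene.search.postingshighlight.WholeBreakIterator
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.search.uhighlight.WholeBreakIterator;

public class WholeFieldHighlightExample {
  static String[] highlightWholeField(IndexSearcher searcher, Analyzer analyzer,
                                      Query query, TopDocs topDocs) throws IOException {
    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer) {
      @Override
      protected BreakIterator getBreakIterator(String field) {
        // Treat the entire field value as a single passage.
        return new WholeBreakIterator();
      }
    };
    // One snippet per hit for the placeholder "body" field.
    return highlighter.highlight("body", query, topDocs);
  }
}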