mirror of https://github.com/apache/lucene.git
LUCENE-7815: Removed the PostingsHighlighter
parent 14320a584c
commit 0d3c73eaa2
@@ -53,6 +53,10 @@ API Changes
 * LUCENE-7741: DoubleValuesSource now has an explain() method (Alan Woodward,
   Adrien Grand)
 
+* LUCENE-7815: Removed the PostingsHighlighter; you should use the UnifiedHighlighter
+  instead, which was derived from it. WholeBreakIterator and
+  CustomSeparatorBreakIterator were moved to the UnifiedHighlighter's package. (David Smiley)
+
 Bug Fixes
 
 * LUCENE-7626: IndexWriter will no longer accept broken token offsets
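As a reading aid for the entry above, a minimal migration sketch (not part of this commit); the helper class and method names are illustrative, "body" stands in for whatever field was being highlighted, and the field must still be indexed with offsets in the postings:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

class HighlightMigrationSketch {
  // Before: new PostingsHighlighter().highlight("body", query, searcher, topDocs);
  // After: the UnifiedHighlighter takes the searcher and analyzer up front.
  static String[] highlightBody(IndexSearcher searcher, Analyzer analyzer,
                                Query query, TopDocs topDocs) throws IOException {
    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
    return highlighter.highlight("body", query, topDocs);
  }
}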
@@ -38,7 +38,7 @@ file.query.maker.file=conf/query-terms.txt
 log.queries=false
 log.step.SearchTravRetHighlight=-1
 
-highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
+highlighter=HlImpl:NONE:SH_A:UH_A:UH_P:UH_PV
 
 { "Populate"
   CreateIndex
@@ -60,6 +60,6 @@ highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
     CloseReader
 
   NewRound
-} : 6
+} : 5
 
 RepSumByPrefRound HL
@@ -42,7 +42,6 @@ import org.apache.lucene.search.highlight.Highlighter;
 import org.apache.lucene.search.highlight.QueryScorer;
 import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
 import org.apache.lucene.search.highlight.TokenSources;
-import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
 import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
 import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
 import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
@@ -133,8 +132,6 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
       case "UH_P": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); break;
       case "UH_PV": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); break;
 
-      case "PH_P": hlImpl = new PostingsHLImpl(); break;
-
       default: throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')");
     }
   }
@@ -224,33 +221,6 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
     return clone;
   }
 
-  private class PostingsHLImpl implements HLImpl {
-    PostingsHighlighter highlighter;
-    String[] fields = hlFields.toArray(new String[hlFields.size()]);
-    int[] maxPassages;
-    PostingsHLImpl() {
-      highlighter = new PostingsHighlighter(maxDocCharsToAnalyze) {
-        @Override
-        protected Analyzer getIndexAnalyzer(String field) { // thus support wildcards
-          return analyzer;
-        }
-
-        @Override
-        protected BreakIterator getBreakIterator(String field) {
-          return BreakIterator.getSentenceInstance(Locale.ENGLISH);
-        }
-      };
-      maxPassages = new int[hlFields.size()];
-      Arrays.fill(maxPassages, maxFrags);
-    }
-
-    @Override
-    public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
-      Map<String, String[]> result = highlighter.highlightFields(fields, q, searcher, hits, maxPassages);
-      preventOptimizeAway = result.size();
-    }
-  }
-
   private class UnifiedHLImpl implements HLImpl {
     UnifiedHighlighter highlighter;
     IndexSearcher lastSearcher;
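For the benchmark change above, the removed PH_P case maps onto the existing UH_P case, which reads offsets from postings. A hedged sketch of pinning that offset source in application code (the class and factory method names are illustrative):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

class PostingsOffsetsSketch {
  // Mirrors what the removed PostingsHLImpl implied: offsets always come from
  // the postings rather than from term vectors or re-analysis.
  static UnifiedHighlighter create(IndexSearcher searcher, Analyzer analyzer) {
    return new UnifiedHighlighter(searcher, analyzer) {
      @Override
      protected OffsetSource getOffsetSource(String field) {
        return OffsetSource.POSTINGS;
      }
    };
  }
}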
@@ -1,137 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.postingshighlight;
|
||||
|
||||
/**
|
||||
* Creates a formatted snippet from the top passages.
|
||||
* <p>
|
||||
* The default implementation marks the query terms as bold, and places
|
||||
* ellipses between unconnected passages.
|
||||
*/
|
||||
public class DefaultPassageFormatter extends PassageFormatter {
|
||||
/** text that will appear before highlighted terms */
|
||||
protected final String preTag;
|
||||
/** text that will appear after highlighted terms */
|
||||
protected final String postTag;
|
||||
/** text that will appear between two unconnected passages */
|
||||
protected final String ellipsis;
|
||||
/** true if we should escape for html */
|
||||
protected final boolean escape;
|
||||
|
||||
/**
|
||||
* Creates a new DefaultPassageFormatter with the default tags.
|
||||
*/
|
||||
public DefaultPassageFormatter() {
|
||||
this("<b>", "</b>", "... ", false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new DefaultPassageFormatter with custom tags.
|
||||
* @param preTag text which should appear before a highlighted term.
|
||||
* @param postTag text which should appear after a highlighted term.
|
||||
* @param ellipsis text which should be used to connect two unconnected passages.
|
||||
* @param escape true if text should be html-escaped
|
||||
*/
|
||||
public DefaultPassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
|
||||
if (preTag == null || postTag == null || ellipsis == null) {
|
||||
throw new NullPointerException();
|
||||
}
|
||||
this.preTag = preTag;
|
||||
this.postTag = postTag;
|
||||
this.ellipsis = ellipsis;
|
||||
this.escape = escape;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String format(Passage passages[], String content) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int pos = 0;
|
||||
for (Passage passage : passages) {
|
||||
// don't add ellipsis if it's the first one, or if it's connected.
|
||||
if (passage.startOffset > pos && pos > 0) {
|
||||
sb.append(ellipsis);
|
||||
}
|
||||
pos = passage.startOffset;
|
||||
for (int i = 0; i < passage.numMatches; i++) {
|
||||
int start = passage.matchStarts[i];
|
||||
int end = passage.matchEnds[i];
|
||||
// it's possible to have overlapping terms
|
||||
if (start > pos) {
|
||||
append(sb, content, pos, start);
|
||||
}
|
||||
if (end > pos) {
|
||||
sb.append(preTag);
|
||||
append(sb, content, Math.max(pos, start), end);
|
||||
sb.append(postTag);
|
||||
pos = end;
|
||||
}
|
||||
}
|
||||
// it's possible a "term" from the analyzer could span a sentence boundary.
|
||||
append(sb, content, pos, Math.max(pos, passage.endOffset));
|
||||
pos = passage.endOffset;
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends original text to the response.
|
||||
* @param dest resulting text, possibly transformed or encoded
|
||||
* @param content original text content
|
||||
* @param start index of the first character in content
|
||||
* @param end index of the character following the last character in content
|
||||
*/
|
||||
protected void append(StringBuilder dest, String content, int start, int end) {
|
||||
if (escape) {
|
||||
// note: these are the rules from owasp.org
|
||||
for (int i = start; i < end; i++) {
|
||||
char ch = content.charAt(i);
|
||||
switch(ch) {
|
||||
case '&':
dest.append("&amp;");
break;
case '<':
dest.append("&lt;");
break;
case '>':
dest.append("&gt;");
break;
case '"':
dest.append("&quot;");
break;
case '\'':
dest.append("&#x27;");
break;
case '/':
dest.append("&#x2F;");
break;
default:
|
||||
if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) {
|
||||
dest.append(ch);
|
||||
} else if (ch < 0xff) {
|
||||
dest.append("&#");
|
||||
dest.append((int)ch);
|
||||
dest.append(";");
|
||||
} else {
|
||||
dest.append(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
dest.append(content, start, end);
|
||||
}
|
||||
}
|
||||
}
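The same formatter (and the four-argument constructor documented above) lives on in the uhighlight package; a small usage sketch, with arbitrary tag strings and assuming an existing searcher and analyzer:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.uhighlight.DefaultPassageFormatter;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

class FormatterSketch {
  static UnifiedHighlighter emphasizedAndEscaped(IndexSearcher searcher, Analyzer analyzer) {
    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
    // preTag, postTag, ellipsis, escape: emphasize hits, join unconnected passages
    // with "... ", and HTML-escape the surrounding content.
    highlighter.setFormatter(new DefaultPassageFormatter("<em>", "</em>", "... ", true));
    return highlighter;
  }
}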
@@ -1,282 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.postingshighlight;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.AutomatonQuery;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.ConstantScoreQuery;
|
||||
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermRangeQuery;
|
||||
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanNotQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanPositionCheckQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
|
||||
/**
|
||||
* Support for highlighting multiterm queries in PostingsHighlighter.
|
||||
*/
|
||||
class MultiTermHighlighting {
|
||||
|
||||
/**
|
||||
* Extracts all MultiTermQueries for {@code field}, and returns equivalent
|
||||
* automata that will match terms.
|
||||
*/
|
||||
static CharacterRunAutomaton[] extractAutomata(Query query, String field) {
|
||||
List<CharacterRunAutomaton> list = new ArrayList<>();
|
||||
if (query instanceof BooleanQuery) {
|
||||
for (BooleanClause clause : (BooleanQuery) query) {
|
||||
if (!clause.isProhibited()) {
|
||||
list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field)));
|
||||
}
|
||||
}
|
||||
} else if (query instanceof ConstantScoreQuery) {
|
||||
list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field)));
|
||||
} else if (query instanceof DisjunctionMaxQuery) {
|
||||
for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
|
||||
list.addAll(Arrays.asList(extractAutomata(sub, field)));
|
||||
}
|
||||
} else if (query instanceof SpanOrQuery) {
|
||||
for (Query sub : ((SpanOrQuery) query).getClauses()) {
|
||||
list.addAll(Arrays.asList(extractAutomata(sub, field)));
|
||||
}
|
||||
} else if (query instanceof SpanNearQuery) {
|
||||
for (Query sub : ((SpanNearQuery) query).getClauses()) {
|
||||
list.addAll(Arrays.asList(extractAutomata(sub, field)));
|
||||
}
|
||||
} else if (query instanceof SpanNotQuery) {
|
||||
list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field)));
|
||||
} else if (query instanceof SpanPositionCheckQuery) {
|
||||
list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field)));
|
||||
} else if (query instanceof SpanMultiTermQueryWrapper) {
|
||||
list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field)));
|
||||
} else if (query instanceof PrefixQuery) {
|
||||
final PrefixQuery pq = (PrefixQuery) query;
|
||||
Term prefix = pq.getPrefix();
|
||||
if (prefix.field().equals(field)) {
|
||||
list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()),
|
||||
Automata.makeAnyString())) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return pq.toString();
|
||||
}
|
||||
});
|
||||
}
|
||||
} else if (query instanceof FuzzyQuery) {
|
||||
final FuzzyQuery fq = (FuzzyQuery) query;
|
||||
if (fq.getField().equals(field)) {
|
||||
String utf16 = fq.getTerm().text();
|
||||
int termText[] = new int[utf16.codePointCount(0, utf16.length())];
|
||||
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
|
||||
termText[j++] = cp = utf16.codePointAt(i);
|
||||
}
|
||||
int termLength = termText.length;
|
||||
int prefixLength = Math.min(fq.getPrefixLength(), termLength);
|
||||
String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
|
||||
LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
|
||||
String prefix = UnicodeUtil.newString(termText, 0, prefixLength);
|
||||
Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), prefix);
|
||||
list.add(new CharacterRunAutomaton(automaton) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return fq.toString();
|
||||
}
|
||||
});
|
||||
}
|
||||
} else if (query instanceof TermRangeQuery) {
|
||||
final TermRangeQuery tq = (TermRangeQuery) query;
|
||||
if (tq.getField().equals(field)) {
|
||||
final CharsRef lowerBound;
|
||||
if (tq.getLowerTerm() == null) {
|
||||
lowerBound = null;
|
||||
} else {
|
||||
lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
|
||||
}
|
||||
|
||||
final CharsRef upperBound;
|
||||
if (tq.getUpperTerm() == null) {
|
||||
upperBound = null;
|
||||
} else {
|
||||
upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
|
||||
}
|
||||
|
||||
final boolean includeLower = tq.includesLower();
|
||||
final boolean includeUpper = tq.includesUpper();
|
||||
final CharsRef scratch = new CharsRef();
|
||||
final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
|
||||
|
||||
// this is *not* an automaton, but it's very simple
|
||||
list.add(new CharacterRunAutomaton(Automata.makeEmpty()) {
|
||||
@Override
|
||||
public boolean run(char[] s, int offset, int length) {
|
||||
scratch.chars = s;
|
||||
scratch.offset = offset;
|
||||
scratch.length = length;
|
||||
|
||||
if (lowerBound != null) {
|
||||
int cmp = comparator.compare(scratch, lowerBound);
|
||||
if (cmp < 0 || (!includeLower && cmp == 0)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (upperBound != null) {
|
||||
int cmp = comparator.compare(scratch, upperBound);
|
||||
if (cmp > 0 || (!includeUpper && cmp == 0)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return tq.toString();
|
||||
}
|
||||
});
|
||||
}
|
||||
} else if (query instanceof AutomatonQuery) {
|
||||
final AutomatonQuery aq = (AutomatonQuery) query;
|
||||
if (aq.getField().equals(field)) {
|
||||
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return aq.toString();
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
return list.toArray(new CharacterRunAutomaton[list.size()]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a "fake" DocsAndPositionsEnum over the tokenstream, returning offsets where {@code matchers}
|
||||
* matches tokens.
|
||||
* <p>
|
||||
* This is solely used internally by PostingsHighlighter: <b>DO NOT USE THIS METHOD!</b>
|
||||
*/
|
||||
static PostingsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException {
|
||||
final CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
ts.reset();
|
||||
|
||||
// TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq()
|
||||
// but this would have a performance cost for likely little gain in the user experience, it
|
||||
// would only serve to make this method less bogus.
|
||||
// instead, we always return freq() = Integer.MAX_VALUE and let PH terminate based on offset...
|
||||
|
||||
return new PostingsEnum() {
|
||||
int currentDoc = -1;
|
||||
int currentMatch = -1;
|
||||
int currentStartOffset = -1;
|
||||
int currentEndOffset = -1;
|
||||
TokenStream stream = ts;
|
||||
|
||||
final BytesRef matchDescriptions[] = new BytesRef[matchers.length];
|
||||
|
||||
@Override
|
||||
public int nextPosition() throws IOException {
|
||||
if (stream != null) {
|
||||
while (stream.incrementToken()) {
|
||||
for (int i = 0; i < matchers.length; i++) {
|
||||
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
|
||||
currentStartOffset = offsetAtt.startOffset();
|
||||
currentEndOffset = offsetAtt.endOffset();
|
||||
currentMatch = i;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
stream.end();
|
||||
stream.close();
|
||||
stream = null;
|
||||
}
|
||||
// exhausted
|
||||
currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
|
||||
return Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int freq() throws IOException {
|
||||
return Integer.MAX_VALUE; // lie
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
assert currentStartOffset >= 0;
|
||||
return currentStartOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
assert currentEndOffset >= 0;
|
||||
return currentEndOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
if (matchDescriptions[currentMatch] == null) {
|
||||
matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
|
||||
}
|
||||
return matchDescriptions[currentMatch];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return currentDoc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
return currentDoc = target;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
|
@@ -1,159 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.postingshighlight;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.InPlaceMergeSorter;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
* Represents a passage (typically a sentence of the document).
|
||||
* <p>
|
||||
* A passage contains {@link #getNumMatches} highlights from the query,
|
||||
* and the offsets and query terms that correspond with each match.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class Passage {
|
||||
int startOffset = -1;
|
||||
int endOffset = -1;
|
||||
float score = 0.0f;
|
||||
|
||||
int matchStarts[] = new int[8];
|
||||
int matchEnds[] = new int[8];
|
||||
BytesRef matchTerms[] = new BytesRef[8];
|
||||
int numMatches = 0;
|
||||
|
||||
void addMatch(int startOffset, int endOffset, BytesRef term) {
|
||||
assert startOffset >= this.startOffset && startOffset <= this.endOffset;
|
||||
if (numMatches == matchStarts.length) {
|
||||
int newLength = ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
|
||||
int newMatchStarts[] = new int[newLength];
|
||||
int newMatchEnds[] = new int[newLength];
|
||||
BytesRef newMatchTerms[] = new BytesRef[newLength];
|
||||
System.arraycopy(matchStarts, 0, newMatchStarts, 0, numMatches);
|
||||
System.arraycopy(matchEnds, 0, newMatchEnds, 0, numMatches);
|
||||
System.arraycopy(matchTerms, 0, newMatchTerms, 0, numMatches);
|
||||
matchStarts = newMatchStarts;
|
||||
matchEnds = newMatchEnds;
|
||||
matchTerms = newMatchTerms;
|
||||
}
|
||||
assert matchStarts.length == matchEnds.length && matchEnds.length == matchTerms.length;
|
||||
matchStarts[numMatches] = startOffset;
|
||||
matchEnds[numMatches] = endOffset;
|
||||
matchTerms[numMatches] = term;
|
||||
numMatches++;
|
||||
}
|
||||
|
||||
void sort() {
|
||||
final int starts[] = matchStarts;
|
||||
final int ends[] = matchEnds;
|
||||
final BytesRef terms[] = matchTerms;
|
||||
new InPlaceMergeSorter() {
|
||||
@Override
|
||||
protected void swap(int i, int j) {
|
||||
int temp = starts[i];
|
||||
starts[i] = starts[j];
|
||||
starts[j] = temp;
|
||||
|
||||
temp = ends[i];
|
||||
ends[i] = ends[j];
|
||||
ends[j] = temp;
|
||||
|
||||
BytesRef tempTerm = terms[i];
|
||||
terms[i] = terms[j];
|
||||
terms[j] = tempTerm;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int compare(int i, int j) {
|
||||
return Integer.compare(starts[i], starts[j]);
|
||||
}
|
||||
|
||||
}.sort(0, numMatches);
|
||||
}
|
||||
|
||||
void reset() {
|
||||
startOffset = endOffset = -1;
|
||||
score = 0.0f;
|
||||
numMatches = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Start offset of this passage.
|
||||
* @return start index (inclusive) of the passage in the
|
||||
* original content: always >= 0.
|
||||
*/
|
||||
public int getStartOffset() {
|
||||
return startOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* End offset of this passage.
|
||||
* @return end index (exclusive) of the passage in the
|
||||
* original content: always >= {@link #getStartOffset()}
|
||||
*/
|
||||
public int getEndOffset() {
|
||||
return endOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Passage's score.
|
||||
*/
|
||||
public float getScore() {
|
||||
return score;
|
||||
}
|
||||
|
||||
/**
|
||||
* Number of term matches available in
|
||||
* {@link #getMatchStarts}, {@link #getMatchEnds},
|
||||
* {@link #getMatchTerms}
|
||||
*/
|
||||
public int getNumMatches() {
|
||||
return numMatches;
|
||||
}
|
||||
|
||||
/**
|
||||
* Start offsets of the term matches, in increasing order.
|
||||
* <p>
|
||||
* Only {@link #getNumMatches} are valid. Note that these
|
||||
* offsets are absolute (not relative to {@link #getStartOffset()}).
|
||||
*/
|
||||
public int[] getMatchStarts() {
|
||||
return matchStarts;
|
||||
}
|
||||
|
||||
/**
|
||||
* End offsets of the term matches, corresponding with {@link #getMatchStarts}.
|
||||
* <p>
|
||||
* Only {@link #getNumMatches} are valid. Note that it's possible that an end offset
|
||||
* could extend beyond the bounds of the passage ({@link #getEndOffset()}), if the
|
||||
* Analyzer produced a term which spans a passage boundary.
|
||||
*/
|
||||
public int[] getMatchEnds() {
|
||||
return matchEnds;
|
||||
}
|
||||
|
||||
/**
|
||||
* BytesRef (term text) of the matches, corresponding with {@link #getMatchStarts()}.
|
||||
* <p>
|
||||
* Only {@link #getNumMatches()} are valid.
|
||||
*/
|
||||
public BytesRef[] getMatchTerms() {
|
||||
return matchTerms;
|
||||
}
|
||||
}
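A hedged sketch of how the accessors above are typically consumed by a custom formatter; the class name is illustrative, and the logic follows DefaultPassageFormatter minus the HTML escaping (the uhighlight Passage exposes the same shape):

package org.apache.lucene.search.postingshighlight;

// Illustrative only: a formatter that brackets each match, driven by the Passage accessors.
public class BracketPassageFormatter extends PassageFormatter {
  @Override
  public Object format(Passage[] passages, String content) {
    StringBuilder sb = new StringBuilder();
    for (Passage passage : passages) {
      int pos = passage.getStartOffset();
      for (int i = 0; i < passage.getNumMatches(); i++) {
        // Match offsets are absolute offsets into 'content', not passage-relative.
        int start = Math.max(pos, passage.getMatchStarts()[i]);
        int end = passage.getMatchEnds()[i];
        if (end <= pos) {
          continue; // an overlapping match that was already covered
        }
        sb.append(content, pos, start);
        sb.append('[').append(content, start, end).append(']');
        pos = end;
      }
      sb.append(content, pos, Math.max(pos, passage.getEndOffset()));
      sb.append(" ... ");
    }
    return sb.toString();
  }
}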
@@ -1,40 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.postingshighlight;
|
||||
|
||||
/**
|
||||
* Creates a formatted snippet from the top passages.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class PassageFormatter {
|
||||
|
||||
/**
|
||||
* Formats the top <code>passages</code> from <code>content</code>
|
||||
* into a human-readable text snippet.
|
||||
*
|
||||
* @param passages top-N passages for the field. Note these are sorted in
|
||||
* the order that they appear in the document for convenience.
|
||||
* @param content content for the field.
|
||||
* @return formatted highlight. Note that for the
|
||||
* non-expert APIs in {@link PostingsHighlighter} that
|
||||
* return String, the toString method on the Object
|
||||
* returned by this method is used to compute the string.
|
||||
*/
|
||||
public abstract Object format(Passage passages[], String content);
|
||||
|
||||
}
|
|
@@ -1,104 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.postingshighlight;
|
||||
|
||||
/**
|
||||
* Ranks passages found by {@link PostingsHighlighter}.
|
||||
* <p>
|
||||
* Each passage is scored as a miniature document within the document.
|
||||
* The final score is computed as {@link #norm} * ∑ ({@link #weight} * {@link #tf}).
|
||||
* The default implementation is {@link #norm} * BM25.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class PassageScorer {
|
||||
|
||||
// TODO: this formula is completely made up. It might not provide relevant snippets!
|
||||
|
||||
/** BM25 k1 parameter, controls term frequency normalization */
|
||||
final float k1;
|
||||
/** BM25 b parameter, controls length normalization. */
|
||||
final float b;
|
||||
/** A pivot used for length normalization. */
|
||||
final float pivot;
|
||||
|
||||
/**
|
||||
* Creates PassageScorer with these default values:
|
||||
* <ul>
|
||||
* <li>{@code k1 = 1.2},
|
||||
* <li>{@code b = 0.75}.
|
||||
* <li>{@code pivot = 87}
|
||||
* </ul>
|
||||
*/
|
||||
public PassageScorer() {
|
||||
// 1.2 and 0.75 are well-known bm25 defaults (but maybe not the best here) ?
|
||||
// 87 is the typical average English sentence length.
|
||||
this(1.2f, 0.75f, 87f);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates PassageScorer with specified scoring parameters
|
||||
* @param k1 Controls non-linear term frequency normalization (saturation).
|
||||
* @param b Controls to what degree passage length normalizes tf values.
|
||||
* @param pivot Pivot value for length normalization (some rough idea of average sentence length in characters).
|
||||
*/
|
||||
public PassageScorer(float k1, float b, float pivot) {
|
||||
this.k1 = k1;
|
||||
this.b = b;
|
||||
this.pivot = pivot;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes term importance, given its in-document statistics.
|
||||
*
|
||||
* @param contentLength length of document in characters
|
||||
* @param totalTermFreq number of times the term occurs in the document
|
||||
* @return term importance
|
||||
*/
|
||||
public float weight(int contentLength, int totalTermFreq) {
|
||||
// approximate #docs from content length
|
||||
float numDocs = 1 + contentLength / pivot;
|
||||
// numDocs not numDocs - docFreq (ala DFR), since we approximate numDocs
|
||||
return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D)/(totalTermFreq + 0.5D));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes term weight, given the frequency within the passage
|
||||
* and the passage's length.
|
||||
*
|
||||
* @param freq number of occurrences of the term within this passage
|
||||
* @param passageLen length of the passage in characters.
|
||||
* @return term weight
|
||||
*/
|
||||
public float tf(int freq, int passageLen) {
|
||||
float norm = k1 * ((1 - b) + b * (passageLen / pivot));
|
||||
return freq / (freq + norm);
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize a passage according to its position in the document.
|
||||
* <p>
|
||||
* Typically passages towards the beginning of the document are
|
||||
* more useful for summarizing the contents.
|
||||
* <p>
|
||||
* The default implementation is <code>1 + 1/log(pivot + passageStart)</code>
|
||||
* @param passageStart start offset of the passage
|
||||
* @return a boost value multiplied into the passage's score.
|
||||
*/
|
||||
public float norm(int passageStart) {
|
||||
return 1 + 1/(float)Math.log(pivot + passageStart);
|
||||
}
|
||||
}
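A worked instance of the formula above (norm(start) * Σ weight * tf) with the default parameters; the input numbers are arbitrary and the code is plain arithmetic, not part of the Lucene API:

public class PassageScoreArithmetic {
  public static void main(String[] args) {
    float k1 = 1.2f, b = 0.75f, pivot = 87f;   // the defaults documented above

    int contentLength = 2000;   // field value length in characters
    int totalTermFreq = 5;      // term occurrences in the whole field value
    int freqInPassage = 2;      // term occurrences inside this passage
    int passageLen = 120;       // passage length in characters
    int passageStart = 300;     // passage start offset in the field value

    // weight(contentLength, totalTermFreq)
    float numDocs = 1 + contentLength / pivot;
    float weight = (float) ((k1 + 1) * Math.log(1 + (numDocs + 0.5d) / (totalTermFreq + 0.5d)));
    // tf(freqInPassage, passageLen)
    float lengthNorm = k1 * ((1 - b) + b * (passageLen / pivot));
    float tf = freqInPassage / (freqInPassage + lengthNorm);
    // norm(passageStart) * sum(weight * tf); one matching term here, so the sum has one element
    float score = (float) (1 + 1 / Math.log(pivot + passageStart)) * (weight * tf);
    System.out.println(score);
  }
}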
@@ -1,820 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.postingshighlight;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.BreakIterator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReaderContext;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.MultiReader;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.ReaderUtil;
|
||||
import org.apache.lucene.index.StoredFieldVisitor;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.InPlaceMergeSorter;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
* Simple highlighter that does not analyze fields nor use
|
||||
* term vectors. Instead it requires
|
||||
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
|
||||
* <p>
|
||||
* PostingsHighlighter treats the single original document as the whole corpus, and then scores individual
|
||||
* passages as if they were documents in this corpus. It uses a {@link BreakIterator} to find
|
||||
* passages in the text; by default it breaks using {@link BreakIterator#getSentenceInstance(Locale)
|
||||
* getSentenceInstance(Locale.ROOT)}. It then iterates in parallel (merge sorting by offset) through
|
||||
* the positions of all terms from the query, coalescing those hits that occur in a single passage
|
||||
* into a {@link Passage}, and then scores each Passage using a separate {@link PassageScorer}.
|
||||
* Passages are finally formatted into highlighted snippets with a {@link PassageFormatter}.
|
||||
* <p>
|
||||
* You can customize the behavior by subclassing this highlighter, some important hooks:
|
||||
* <ul>
|
||||
* <li>{@link #getBreakIterator(String)}: Customize how the text is divided into passages.
|
||||
* <li>{@link #getScorer(String)}: Customize how passages are ranked.
|
||||
* <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
|
||||
* <li>{@link #getIndexAnalyzer(String)}: Enable highlighting of MultiTermQuerys such as {@code WildcardQuery}.
|
||||
* </ul>
|
||||
* <p>
|
||||
* <b>WARNING</b>: The code is very new and probably still has some exciting bugs!
|
||||
* <p>
|
||||
* Example usage:
|
||||
* <pre class="prettyprint">
|
||||
* // configure field with offsets at index time
|
||||
* FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
* offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
* Field body = new Field("body", "foobar", offsetsType);
|
||||
*
|
||||
* // retrieve highlights at query time
|
||||
* PostingsHighlighter highlighter = new PostingsHighlighter();
|
||||
* Query query = new TermQuery(new Term("body", "highlighting"));
|
||||
* TopDocs topDocs = searcher.search(query, n);
|
||||
* String highlights[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
* </pre>
|
||||
* <p>
|
||||
* This is thread-safe, and can be used across different readers.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class PostingsHighlighter {
|
||||
|
||||
// TODO: maybe allow re-analysis for tiny fields? currently we require offsets,
|
||||
// but if the analyzer is really fast and the field is tiny, this might really be
|
||||
// unnecessary.
|
||||
|
||||
/** for rewriting: we don't want slow processing from MTQs */
|
||||
private static final IndexSearcher EMPTY_INDEXSEARCHER;
|
||||
static {
|
||||
try {
|
||||
IndexReader emptyReader = new MultiReader();
|
||||
EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader);
|
||||
EMPTY_INDEXSEARCHER.setQueryCache(null);
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
}
|
||||
|
||||
/** Default maximum content size to process. Typically snippets
|
||||
* closer to the beginning of the document better summarize its content */
|
||||
public static final int DEFAULT_MAX_LENGTH = 10000;
|
||||
|
||||
private final int maxLength;
|
||||
|
||||
/** Set the first time {@link #getFormatter} is called,
|
||||
* and then reused. */
|
||||
private PassageFormatter defaultFormatter;
|
||||
|
||||
/** Set the first time {@link #getScorer} is called,
|
||||
* and then reused. */
|
||||
private PassageScorer defaultScorer;
|
||||
|
||||
/**
|
||||
* Creates a new highlighter with {@link #DEFAULT_MAX_LENGTH}.
|
||||
*/
|
||||
public PostingsHighlighter() {
|
||||
this(DEFAULT_MAX_LENGTH);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new highlighter, specifying maximum content length.
|
||||
* @param maxLength maximum content size to process.
|
||||
* @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
|
||||
*/
|
||||
public PostingsHighlighter(int maxLength) {
|
||||
if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
|
||||
// two reasons: no overflow problems in BreakIterator.preceding(offset+1),
|
||||
// our sentinel in the offsets queue uses this value to terminate.
|
||||
throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
|
||||
}
|
||||
this.maxLength = maxLength;
|
||||
}
|
||||
|
||||
/** Returns the {@link BreakIterator} to use for
|
||||
* dividing text into passages. This returns
|
||||
* {@link BreakIterator#getSentenceInstance(Locale)} by default;
|
||||
* subclasses can override to customize. */
|
||||
protected BreakIterator getBreakIterator(String field) {
|
||||
return BreakIterator.getSentenceInstance(Locale.ROOT);
|
||||
}
|
||||
|
||||
/** Returns the {@link PassageFormatter} to use for
|
||||
* formatting passages into highlighted snippets. This
|
||||
* returns a new {@code PassageFormatter} by default;
|
||||
* subclasses can override to customize. */
|
||||
protected PassageFormatter getFormatter(String field) {
|
||||
if (defaultFormatter == null) {
|
||||
defaultFormatter = new DefaultPassageFormatter();
|
||||
}
|
||||
return defaultFormatter;
|
||||
}
|
||||
|
||||
/** Returns the {@link PassageScorer} to use for
|
||||
* ranking passages. This
|
||||
* returns a new {@code PassageScorer} by default;
|
||||
* subclasses can override to customize. */
|
||||
protected PassageScorer getScorer(String field) {
|
||||
if (defaultScorer == null) {
|
||||
defaultScorer = new PassageScorer();
|
||||
}
|
||||
return defaultScorer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Highlights the top passages from a single field.
|
||||
*
|
||||
* @param field field name to highlight.
|
||||
* Must have a stored string value and also be indexed with offsets.
|
||||
* @param query query to highlight.
|
||||
* @param searcher searcher that was previously used to execute the query.
|
||||
* @param topDocs TopDocs containing the summary result documents to highlight.
|
||||
* @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
|
||||
* If no highlights were found for a document, the
|
||||
* first sentence for the field will be returned.
|
||||
* @throws IOException if an I/O error occurred during processing
|
||||
* @throws IllegalArgumentException if <code>field</code> was indexed without
|
||||
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
|
||||
*/
|
||||
public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
|
||||
return highlight(field, query, searcher, topDocs, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Highlights the top-N passages from a single field.
|
||||
*
|
||||
* @param field field name to highlight.
|
||||
* Must have a stored string value and also be indexed with offsets.
|
||||
* @param query query to highlight.
|
||||
* @param searcher searcher that was previously used to execute the query.
|
||||
* @param topDocs TopDocs containing the summary result documents to highlight.
|
||||
* @param maxPassages The maximum number of top-N ranked passages used to
|
||||
* form the highlighted snippets.
|
||||
* @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
|
||||
* If no highlights were found for a document, the
|
||||
* first {@code maxPassages} sentences from the
|
||||
* field will be returned.
|
||||
* @throws IOException if an I/O error occurred during processing
|
||||
* @throws IllegalArgumentException if <code>field</code> was indexed without
|
||||
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
|
||||
*/
|
||||
public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
|
||||
Map<String,String[]> res = highlightFields(new String[] { field }, query, searcher, topDocs, new int[] { maxPassages });
|
||||
return res.get(field);
|
||||
}
|
||||
|
||||
/**
|
||||
* Highlights the top passages from multiple fields.
|
||||
* <p>
|
||||
* Conceptually, this behaves as a more efficient form of:
|
||||
* <pre class="prettyprint">
|
||||
* Map m = new HashMap();
|
||||
* for (String field : fields) {
|
||||
* m.put(field, highlight(field, query, searcher, topDocs));
|
||||
* }
|
||||
* return m;
|
||||
* </pre>
|
||||
*
|
||||
* @param fields field names to highlight.
|
||||
* Must have a stored string value and also be indexed with offsets.
|
||||
* @param query query to highlight.
|
||||
* @param searcher searcher that was previously used to execute the query.
|
||||
* @param topDocs TopDocs containing the summary result documents to highlight.
|
||||
* @return Map keyed on field name, containing the array of formatted snippets
|
||||
* corresponding to the documents in <code>topDocs</code>.
|
||||
* If no highlights were found for a document, the
|
||||
* first sentence from the field will be returned.
|
||||
* @throws IOException if an I/O error occurred during processing
|
||||
* @throws IllegalArgumentException if <code>field</code> was indexed without
|
||||
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
|
||||
*/
|
||||
public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
|
||||
int maxPassages[] = new int[fields.length];
|
||||
Arrays.fill(maxPassages, 1);
|
||||
return highlightFields(fields, query, searcher, topDocs, maxPassages);
|
||||
}
|
||||
|
||||
/**
|
||||
* Highlights the top-N passages from multiple fields.
|
||||
* <p>
|
||||
* Conceptually, this behaves as a more efficient form of:
|
||||
* <pre class="prettyprint">
|
||||
* Map m = new HashMap();
|
||||
* for (String field : fields) {
|
||||
* m.put(field, highlight(field, query, searcher, topDocs, maxPassages));
|
||||
* }
|
||||
* return m;
|
||||
* </pre>
|
||||
*
|
||||
* @param fields field names to highlight.
|
||||
* Must have a stored string value and also be indexed with offsets.
|
||||
* @param query query to highlight.
|
||||
* @param searcher searcher that was previously used to execute the query.
|
||||
* @param topDocs TopDocs containing the summary result documents to highlight.
|
||||
* @param maxPassages The maximum number of top-N ranked passages per-field used to
|
||||
* form the highlighted snippets.
|
||||
* @return Map keyed on field name, containing the array of formatted snippets
|
||||
* corresponding to the documents in <code>topDocs</code>.
|
||||
* If no highlights were found for a document, the
|
||||
* first {@code maxPassages} sentences from the
|
||||
* field will be returned.
|
||||
* @throws IOException if an I/O error occurred during processing
|
||||
* @throws IllegalArgumentException if <code>field</code> was indexed without
|
||||
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
|
||||
*/
|
||||
public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages[]) throws IOException {
|
||||
final ScoreDoc scoreDocs[] = topDocs.scoreDocs;
|
||||
int docids[] = new int[scoreDocs.length];
|
||||
for (int i = 0; i < docids.length; i++) {
|
||||
docids[i] = scoreDocs[i].doc;
|
||||
}
|
||||
|
||||
return highlightFields(fields, query, searcher, docids, maxPassages);
|
||||
}
|
||||
|
||||
/**
|
||||
* Highlights the top-N passages from multiple fields,
|
||||
* for the provided int[] docids.
|
||||
*
|
||||
* @param fieldsIn field names to highlight.
|
||||
* Must have a stored string value and also be indexed with offsets.
|
||||
* @param query query to highlight.
|
||||
* @param searcher searcher that was previously used to execute the query.
|
||||
* @param docidsIn containing the document IDs to highlight.
|
||||
* @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
|
||||
* form the highlighted snippets.
|
||||
* @return Map keyed on field name, containing the array of formatted snippets
|
||||
* corresponding to the documents in <code>docidsIn</code>.
|
||||
* If no highlights were found for a document, the
|
||||
* first {@code maxPassages} sentences from the field will
|
||||
* be returned.
|
||||
* @throws IOException if an I/O error occurred during processing
|
||||
* @throws IllegalArgumentException if <code>field</code> was indexed without
|
||||
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
|
||||
*/
|
||||
public Map<String,String[]> highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
|
||||
Map<String,String[]> snippets = new HashMap<>();
|
||||
for(Map.Entry<String,Object[]> ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) {
|
||||
Object[] snippetObjects = ent.getValue();
|
||||
String[] snippetStrings = new String[snippetObjects.length];
|
||||
snippets.put(ent.getKey(), snippetStrings);
|
||||
for(int i=0;i<snippetObjects.length;i++) {
|
||||
Object snippet = snippetObjects[i];
|
||||
if (snippet != null) {
|
||||
snippetStrings[i] = snippet.toString();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return snippets;
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: highlights the top-N passages from multiple fields,
|
||||
* for the provided int[] docids, to custom Object as
|
||||
* returned by the {@link PassageFormatter}. Use
|
||||
* this API to render to something other than String.
|
||||
*
|
||||
* @param fieldsIn field names to highlight.
|
||||
* Must have a stored string value and also be indexed with offsets.
|
||||
* @param query query to highlight.
|
||||
* @param searcher searcher that was previously used to execute the query.
|
||||
* @param docidsIn containing the document IDs to highlight.
|
||||
* @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
|
||||
* form the highlighted snippets.
|
||||
* @return Map keyed on field name, containing the array of formatted snippets
|
||||
* corresponding to the documents in <code>docidsIn</code>.
|
||||
* If no highlights were found for a document, the
|
||||
* first {@code maxPassages} sentences from the field will
|
||||
* be returned.
|
||||
* @throws IOException if an I/O error occurred during processing
|
||||
* @throws IllegalArgumentException if <code>field</code> was indexed without
|
||||
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
|
||||
*/
|
||||
protected Map<String,Object[]> highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
|
||||
if (fieldsIn.length < 1) {
|
||||
throw new IllegalArgumentException("fieldsIn must not be empty");
|
||||
}
|
||||
if (fieldsIn.length != maxPassagesIn.length) {
|
||||
throw new IllegalArgumentException("invalid number of maxPassagesIn");
|
||||
}
|
||||
SortedSet<Term> queryTerms = new TreeSet<>();
|
||||
EMPTY_INDEXSEARCHER.createNormalizedWeight(query, false).extractTerms(queryTerms);
|
||||
|
||||
IndexReaderContext readerContext = searcher.getIndexReader().getContext();
|
||||
List<LeafReaderContext> leaves = readerContext.leaves();
|
||||
|
||||
// Make our own copies because we sort in-place:
|
||||
int[] docids = new int[docidsIn.length];
|
||||
System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length);
|
||||
final String fields[] = new String[fieldsIn.length];
|
||||
System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
|
||||
final int maxPassages[] = new int[maxPassagesIn.length];
|
||||
System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);
|
||||
|
||||
// sort for sequential io
|
||||
Arrays.sort(docids);
|
||||
new InPlaceMergeSorter() {
|
||||
|
||||
@Override
|
||||
protected void swap(int i, int j) {
|
||||
String tmp = fields[i];
|
||||
fields[i] = fields[j];
|
||||
fields[j] = tmp;
|
||||
int tmp2 = maxPassages[i];
|
||||
maxPassages[i] = maxPassages[j];
|
||||
maxPassages[j] = tmp2;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int compare(int i, int j) {
|
||||
return fields[i].compareTo(fields[j]);
|
||||
}
|
||||
|
||||
}.sort(0, fields.length);
|
||||
|
||||
// pull stored data:
|
||||
String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);
|
||||
|
||||
Map<String,Object[]> highlights = new HashMap<>();
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
String field = fields[i];
|
||||
int numPassages = maxPassages[i];
|
||||
Term floor = new Term(field, "");
|
||||
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
|
||||
SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
|
||||
// TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
|
||||
|
||||
// Strip off the redundant field:
|
||||
BytesRef terms[] = new BytesRef[fieldTerms.size()];
|
||||
int termUpto = 0;
|
||||
for(Term term : fieldTerms) {
|
||||
terms[termUpto++] = term.bytes();
|
||||
}
|
||||
Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages, query);
|
||||
|
||||
Object[] result = new Object[docids.length];
|
||||
for (int j = 0; j < docidsIn.length; j++) {
|
||||
result[j] = fieldHighlights.get(docidsIn[j]);
|
||||
}
|
||||
highlights.put(field, result);
|
||||
}
|
||||
return highlights;
|
||||
}
|
||||
|
||||
/** Loads the String values for each field X docID to be
|
||||
* highlighted. By default this loads from stored
|
||||
* fields, but a subclass can change the source. This
|
||||
* method should allocate the String[fields.length][docids.length]
|
||||
* and fill all values. The returned Strings must be
|
||||
* identical to what was indexed. */
|
||||
protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
|
||||
String contents[][] = new String[fields.length][docids.length];
|
||||
char valueSeparators[] = new char[fields.length];
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
valueSeparators[i] = getMultiValuedSeparator(fields[i]);
|
||||
}
|
||||
LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength);
|
||||
for (int i = 0; i < docids.length; i++) {
|
||||
searcher.doc(docids[i], visitor);
|
||||
for (int j = 0; j < fields.length; j++) {
|
||||
contents[j][i] = visitor.getValue(j).toString();
|
||||
}
|
||||
visitor.reset();
|
||||
}
|
||||
return contents;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the logical separator between values for multi-valued fields.
|
||||
* The default value is a space character, which means passages can span across values,
|
||||
* but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)}
|
||||
* if each value holds a discrete passage for highlighting.
|
||||
*/
|
||||
protected char getMultiValuedSeparator(String field) {
|
||||
return ' ';
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the analyzer originally used to index the content for {@code field}.
|
||||
* <p>
|
||||
* This is used to highlight some MultiTermQueries.
|
||||
* @return Analyzer or null (the default, meaning no special multi-term processing)
|
||||
*/
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return null;
|
||||
}
|
||||
|
||||
private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<LeafReaderContext> leaves, int maxPassages, Query query) throws IOException {
|
||||
Map<Integer,Object> highlights = new HashMap<>();
|
||||
|
||||
PassageFormatter fieldFormatter = getFormatter(field);
|
||||
if (fieldFormatter == null) {
|
||||
throw new NullPointerException("PassageFormatter must not be null");
|
||||
}
|
||||
|
||||
// check if we should do any multiterm processing
|
||||
Analyzer analyzer = getIndexAnalyzer(field);
|
||||
CharacterRunAutomaton automata[] = new CharacterRunAutomaton[0];
|
||||
if (analyzer != null) {
|
||||
automata = MultiTermHighlighting.extractAutomata(query, field);
|
||||
}
|
||||
|
||||
// resize 'terms', where the last term is the multiterm matcher
|
||||
if (automata.length > 0) {
|
||||
BytesRef newTerms[] = new BytesRef[terms.length + 1];
|
||||
System.arraycopy(terms, 0, newTerms, 0, terms.length);
|
||||
terms = newTerms;
|
||||
}
|
||||
|
||||
// we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes
|
||||
// otherwise, we will just advance() existing enums to the new document in the same segment.
|
||||
PostingsEnum postings[] = null;
|
||||
TermsEnum termsEnum = null;
|
||||
int lastLeaf = -1;
|
||||
|
||||
for (int i = 0; i < docids.length; i++) {
|
||||
String content = contents[i];
|
||||
if (content.length() == 0) {
|
||||
continue; // nothing to do
|
||||
}
|
||||
bi.setText(content);
|
||||
int doc = docids[i];
|
||||
int leaf = ReaderUtil.subIndex(doc, leaves);
|
||||
LeafReaderContext subContext = leaves.get(leaf);
|
||||
LeafReader r = subContext.reader();
|
||||
|
||||
assert leaf >= lastLeaf; // increasing order
|
||||
|
||||
// if the segment has changed, we must initialize new enums.
|
||||
if (leaf != lastLeaf) {
|
||||
Terms t = r.terms(field);
|
||||
if (t != null) {
|
||||
if (!t.hasOffsets()) {
|
||||
// no offsets available
|
||||
throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
|
||||
}
|
||||
termsEnum = t.iterator();
|
||||
postings = new PostingsEnum[terms.length];
|
||||
} else {
|
||||
termsEnum = null;
|
||||
}
|
||||
}
|
||||
if (termsEnum == null) {
|
||||
continue; // no terms for this field, nothing to do
|
||||
}
|
||||
|
||||
// if there are multi-term matches, we have to initialize the "fake" enum for each document
|
||||
if (automata.length > 0) {
|
||||
PostingsEnum dp = MultiTermHighlighting.getDocsEnum(analyzer.tokenStream(field, content), automata);
|
||||
dp.advance(doc - subContext.docBase);
|
||||
postings[terms.length-1] = dp; // last term is the multiterm matcher
|
||||
}
|
||||
|
||||
Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
|
||||
|
||||
if (passages.length == 0) {
|
||||
// no passages were returned, so ask for a default summary
|
||||
passages = getEmptyHighlight(field, bi, maxPassages);
|
||||
}
|
||||
|
||||
if (passages.length > 0) {
|
||||
highlights.put(doc, fieldFormatter.format(passages, content));
|
||||
}
|
||||
|
||||
lastLeaf = leaf;
|
||||
}
|
||||
|
||||
return highlights;
|
||||
}
|
||||
|
||||
// algorithm: treat sentence snippets as miniature documents
|
||||
// we can intersect these with the postings lists via BreakIterator.preceding(offset)
|
||||
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
|
||||
private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc,
|
||||
TermsEnum termsEnum, PostingsEnum[] postings, int n) throws IOException {
|
||||
PassageScorer scorer = getScorer(field);
|
||||
if (scorer == null) {
|
||||
throw new NullPointerException("PassageScorer must not be null");
|
||||
}
|
||||
PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>();
|
||||
float weights[] = new float[terms.length];
|
||||
// initialize postings
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
PostingsEnum de = postings[i];
|
||||
int pDoc;
|
||||
if (de == EMPTY) {
|
||||
continue;
|
||||
} else if (de == null) {
|
||||
postings[i] = EMPTY; // initially
|
||||
if (!termsEnum.seekExact(terms[i])) {
|
||||
continue; // term not found
|
||||
}
|
||||
de = postings[i] = termsEnum.postings(null, PostingsEnum.OFFSETS);
|
||||
assert de != null;
|
||||
pDoc = de.advance(doc);
|
||||
} else {
|
||||
pDoc = de.docID();
|
||||
if (pDoc < doc) {
|
||||
pDoc = de.advance(doc);
|
||||
}
|
||||
}
|
||||
|
||||
if (doc == pDoc) {
|
||||
weights[i] = scorer.weight(contentLength, de.freq());
|
||||
de.nextPosition();
|
||||
pq.add(new OffsetsEnum(de, i));
|
||||
}
|
||||
}
|
||||
|
||||
pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination
|
||||
|
||||
PriorityQueue<Passage> passageQueue = new PriorityQueue<>(n, new Comparator<Passage>() {
|
||||
@Override
|
||||
public int compare(Passage left, Passage right) {
|
||||
if (left.score < right.score) {
|
||||
return -1;
|
||||
} else if (left.score > right.score) {
|
||||
return 1;
|
||||
} else {
|
||||
return left.startOffset - right.startOffset;
|
||||
}
|
||||
}
|
||||
});
|
||||
Passage current = new Passage();
|
||||
|
||||
OffsetsEnum off;
|
||||
while ((off = pq.poll()) != null) {
|
||||
final PostingsEnum dp = off.dp;
|
||||
int start = dp.startOffset();
|
||||
assert start >= 0;
|
||||
int end = dp.endOffset();
|
||||
// LUCENE-5166: this hit would span the content limit... however more valid
|
||||
// hits may exist (they are sorted by start). so we pretend like we never
|
||||
// saw this term, it won't cause a passage to be added to passageQueue or anything.
|
||||
assert EMPTY.startOffset() == Integer.MAX_VALUE;
|
||||
if (start < contentLength && end > contentLength) {
|
||||
continue;
|
||||
}
|
||||
if (start >= current.endOffset) {
|
||||
if (current.startOffset >= 0) {
|
||||
// finalize current
|
||||
current.score *= scorer.norm(current.startOffset);
|
||||
// new sentence: first add 'current' to queue
|
||||
if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
|
||||
current.reset(); // can't compete, just reset it
|
||||
} else {
|
||||
passageQueue.offer(current);
|
||||
if (passageQueue.size() > n) {
|
||||
current = passageQueue.poll();
|
||||
current.reset();
|
||||
} else {
|
||||
current = new Passage();
|
||||
}
|
||||
}
|
||||
}
|
||||
// if we exceed limit, we are done
|
||||
if (start >= contentLength) {
|
||||
Passage passages[] = new Passage[passageQueue.size()];
|
||||
passageQueue.toArray(passages);
|
||||
for (Passage p : passages) {
|
||||
p.sort();
|
||||
}
|
||||
// sort in ascending order
|
||||
Arrays.sort(passages, new Comparator<Passage>() {
|
||||
@Override
|
||||
public int compare(Passage left, Passage right) {
|
||||
return left.startOffset - right.startOffset;
|
||||
}
|
||||
});
|
||||
return passages;
|
||||
}
|
||||
// advance breakiterator
|
||||
assert BreakIterator.DONE < 0;
|
||||
current.startOffset = Math.max(bi.preceding(start+1), 0);
|
||||
current.endOffset = Math.min(bi.next(), contentLength);
|
||||
}
|
||||
int tf = 0;
|
||||
while (true) {
|
||||
tf++;
|
||||
BytesRef term = terms[off.id];
|
||||
if (term == null) {
|
||||
// multitermquery match, pull from payload
|
||||
term = off.dp.getPayload();
|
||||
assert term != null;
|
||||
}
|
||||
current.addMatch(start, end, term);
|
||||
if (off.pos == dp.freq()) {
|
||||
break; // removed from pq
|
||||
} else {
|
||||
off.pos++;
|
||||
dp.nextPosition();
|
||||
start = dp.startOffset();
|
||||
end = dp.endOffset();
|
||||
}
|
||||
if (start >= current.endOffset || end > contentLength) {
|
||||
pq.offer(off);
|
||||
break;
|
||||
}
|
||||
}
|
||||
current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
|
||||
}
|
||||
|
||||
// Dead code but compiler disagrees:
|
||||
assert false;
|
||||
return null;
|
||||
}
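  // Illustrative helper (added for exposition only; not used by the code above): the
  // per-passage score accumulated in highlightDoc() is equivalent to computing, in one pass,
  //   norm(passage.startOffset) * sum over terms t of weight[t] * tf(freq of t in passage, passage length)
  private static float scorePassage(PassageScorer scorer, Passage passage,
                                    float[] termWeights, int[] termFreqsInPassage) {
    int passageLength = passage.endOffset - passage.startOffset;
    float sum = 0f;
    for (int t = 0; t < termWeights.length; t++) {
      sum += termWeights[t] * scorer.tf(termFreqsInPassage[t], passageLength);
    }
    return scorer.norm(passage.startOffset) * sum;
  }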
|
||||
|
||||
/** Called to summarize a document when no hits were
|
||||
* found. By default this just returns the first
|
||||
* {@code maxPassages} sentences; subclasses can override
|
||||
* to customize. */
|
||||
protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
|
||||
// BreakIterator should be un-next'd:
|
||||
List<Passage> passages = new ArrayList<>();
|
||||
int pos = bi.current();
|
||||
assert pos == 0;
|
||||
while (passages.size() < maxPassages) {
|
||||
int next = bi.next();
|
||||
if (next == BreakIterator.DONE) {
|
||||
break;
|
||||
}
|
||||
Passage passage = new Passage();
|
||||
passage.score = Float.NaN;
|
||||
passage.startOffset = pos;
|
||||
passage.endOffset = next;
|
||||
passages.add(passage);
|
||||
pos = next;
|
||||
}
|
||||
|
||||
return passages.toArray(new Passage[passages.size()]);
|
||||
}
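  // Example override (illustrative; not part of this file): a subclass that prefers no
  // summary at all when a field has no hits can simply return an empty array:
  //
  //   PostingsHighlighter highlighter = new PostingsHighlighter() {
  //     @Override
  //     protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
  //       return new Passage[0];
  //     }
  //   };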
|
||||
|
||||
private static class OffsetsEnum implements Comparable<OffsetsEnum> {
|
||||
PostingsEnum dp;
|
||||
int pos;
|
||||
int id;
|
||||
|
||||
OffsetsEnum(PostingsEnum dp, int id) throws IOException {
|
||||
this.dp = dp;
|
||||
this.id = id;
|
||||
this.pos = 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(OffsetsEnum other) {
|
||||
try {
|
||||
int off = dp.startOffset();
|
||||
int otherOff = other.dp.startOffset();
|
||||
if (off == otherOff) {
|
||||
return id - other.id;
|
||||
} else {
|
||||
return Integer.compare(off, otherOff);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
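  /** Sentinel postings enum: it reports offsets of Integer.MAX_VALUE so it always sorts
   *  last in the offsets queue, and it stands in for query terms absent from a document. */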
private static final PostingsEnum EMPTY = new PostingsEnum() {
|
||||
|
||||
@Override
|
||||
public int nextPosition() throws IOException { return -1; }
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException { return Integer.MAX_VALUE; }
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException { return Integer.MAX_VALUE; }
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException { return null; }
|
||||
|
||||
@Override
|
||||
public int freq() throws IOException { return 0; }
|
||||
|
||||
@Override
|
||||
public int docID() { return NO_MORE_DOCS; }
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException { return NO_MORE_DOCS; }
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException { return NO_MORE_DOCS; }
|
||||
|
||||
@Override
|
||||
public long cost() { return 0; }
|
||||
};
|
||||
|
||||
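  /** Visitor that loads only the requested stored fields, joining multiple values of a
   *  field with its configured separator and truncating the collected text at maxLength. */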
private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
|
||||
private final String fields[];
|
||||
private final char valueSeparators[];
|
||||
private final int maxLength;
|
||||
private final StringBuilder builders[];
|
||||
private int currentField = -1;
|
||||
|
||||
public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) {
|
||||
assert fields.length == valueSeparators.length;
|
||||
this.fields = fields;
|
||||
this.valueSeparators = valueSeparators;
|
||||
this.maxLength = maxLength;
|
||||
builders = new StringBuilder[fields.length];
|
||||
for (int i = 0; i < builders.length; i++) {
|
||||
builders[i] = new StringBuilder();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
|
||||
String value = new String(bytes, StandardCharsets.UTF_8);
|
||||
assert currentField >= 0;
|
||||
StringBuilder builder = builders[currentField];
|
||||
if (builder.length() > 0 && builder.length() < maxLength) {
|
||||
builder.append(valueSeparators[currentField]);
|
||||
}
|
||||
if (builder.length() + value.length() > maxLength) {
|
||||
builder.append(value, 0, maxLength - builder.length());
|
||||
} else {
|
||||
builder.append(value);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Status needsField(FieldInfo fieldInfo) throws IOException {
|
||||
currentField = Arrays.binarySearch(fields, fieldInfo.name);
|
||||
if (currentField < 0) {
|
||||
return Status.NO;
|
||||
} else if (builders[currentField].length() > maxLength) {
|
||||
return fields.length == 1 ? Status.STOP : Status.NO;
|
||||
}
|
||||
return Status.YES;
|
||||
}
|
||||
|
||||
String getValue(int i) {
|
||||
return builders[i].toString();
|
||||
}
|
||||
|
||||
void reset() {
|
||||
currentField = -1;
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
builders[i].setLength(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -1,21 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Highlighter implementation that uses offsets from postings lists.
|
||||
*/
|
||||
package org.apache.lucene.search.postingshighlight;
|
|
@@ -0,0 +1,150 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import java.text.BreakIterator;
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
/**
|
||||
* A {@link BreakIterator} that breaks the text whenever a certain separator, provided as a constructor argument, is found.
|
||||
*/
|
||||
public final class CustomSeparatorBreakIterator extends BreakIterator {
|
||||
|
||||
private final char separator;
|
||||
private CharacterIterator text;
|
||||
private int current;
|
||||
|
||||
public CustomSeparatorBreakIterator(char separator) {
|
||||
this.separator = separator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int current() {
|
||||
return current;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int first() {
|
||||
text.setIndex(text.getBeginIndex());
|
||||
return current = text.getIndex();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int last() {
|
||||
text.setIndex(text.getEndIndex());
|
||||
return current = text.getIndex();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next() {
|
||||
if (text.getIndex() == text.getEndIndex()) {
|
||||
return DONE;
|
||||
} else {
|
||||
return advanceForward();
|
||||
}
|
||||
}
|
||||
|
||||
private int advanceForward() {
|
||||
char c;
|
||||
while ((c = text.next()) != CharacterIterator.DONE) {
|
||||
if (c == separator) {
|
||||
return current = text.getIndex() + 1;
|
||||
}
|
||||
}
|
||||
assert text.getIndex() == text.getEndIndex();
|
||||
return current = text.getIndex();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int following(int pos) {
|
||||
if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
|
||||
throw new IllegalArgumentException("offset out of bounds");
|
||||
} else if (pos == text.getEndIndex()) {
|
||||
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
|
||||
// https://bugs.openjdk.java.net/browse/JDK-8015110
|
||||
text.setIndex(text.getEndIndex());
|
||||
current = text.getIndex();
|
||||
return DONE;
|
||||
} else {
|
||||
text.setIndex(pos);
|
||||
current = text.getIndex();
|
||||
return advanceForward();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int previous() {
|
||||
if (text.getIndex() == text.getBeginIndex()) {
|
||||
return DONE;
|
||||
} else {
|
||||
return advanceBackward();
|
||||
}
|
||||
}
|
||||
|
||||
private int advanceBackward() {
|
||||
char c;
|
||||
while ((c = text.previous()) != CharacterIterator.DONE) {
|
||||
if (c == separator) {
|
||||
return current = text.getIndex() + 1;
|
||||
}
|
||||
}
|
||||
assert text.getIndex() == text.getBeginIndex();
|
||||
return current = text.getIndex();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int preceding(int pos) {
|
||||
if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
|
||||
throw new IllegalArgumentException("offset out of bounds");
|
||||
} else if (pos == text.getBeginIndex()) {
|
||||
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
|
||||
// https://bugs.openjdk.java.net/browse/JDK-8015110
|
||||
text.setIndex(text.getBeginIndex());
|
||||
current = text.getIndex();
|
||||
return DONE;
|
||||
} else {
|
||||
text.setIndex(pos);
|
||||
current = text.getIndex();
|
||||
return advanceBackward();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next(int n) {
|
||||
if (n < 0) {
|
||||
for (int i = 0; i < -n; i++) {
|
||||
previous();
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < n; i++) {
|
||||
next();
|
||||
}
|
||||
}
|
||||
return current();
|
||||
}
|
||||
|
||||
@Override
|
||||
public CharacterIterator getText() {
|
||||
return text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setText(CharacterIterator newText) {
|
||||
text = newText;
|
||||
current = text.getBeginIndex();
|
||||
}
|
||||
}
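A minimal usage sketch of the relocated CustomSeparatorBreakIterator (the demo class, sample text, and printed output below are illustrative, not part of this change):

import java.text.BreakIterator;
import java.text.StringCharacterIterator;

import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;

public class CustomSeparatorBreakIteratorDemo {
  public static void main(String[] args) {
    // break whenever '|' appears, e.g. for multi-valued field text joined with '|'
    BreakIterator bi = new CustomSeparatorBreakIterator('|');
    bi.setText(new StringCharacterIterator("first value|second value|third value"));
    int start = bi.current();
    for (int end = bi.next(); end != BreakIterator.DONE; start = end, end = bi.next()) {
      System.out.println("fragment [" + start + ", " + end + ")");
    }
  }
}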
|
|
@@ -0,0 +1,116 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import java.text.BreakIterator;
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
/** Just produces one single fragment for the entire text */
|
||||
public final class WholeBreakIterator extends BreakIterator {
|
||||
private CharacterIterator text;
|
||||
private int start;
|
||||
private int end;
|
||||
private int current;
|
||||
|
||||
@Override
|
||||
public int current() {
|
||||
return current;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int first() {
|
||||
return (current = start);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int following(int pos) {
|
||||
if (pos < start || pos > end) {
|
||||
throw new IllegalArgumentException("offset out of bounds");
|
||||
} else if (pos == end) {
|
||||
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
|
||||
// https://bugs.openjdk.java.net/browse/JDK-8015110
|
||||
current = end;
|
||||
return DONE;
|
||||
} else {
|
||||
return last();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public CharacterIterator getText() {
|
||||
return text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int last() {
|
||||
return (current = end);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next() {
|
||||
if (current == end) {
|
||||
return DONE;
|
||||
} else {
|
||||
return last();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next(int n) {
|
||||
if (n < 0) {
|
||||
for (int i = 0; i < -n; i++) {
|
||||
previous();
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < n; i++) {
|
||||
next();
|
||||
}
|
||||
}
|
||||
return current();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int preceding(int pos) {
|
||||
if (pos < start || pos > end) {
|
||||
throw new IllegalArgumentException("offset out of bounds");
|
||||
} else if (pos == start) {
|
||||
// this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
|
||||
// https://bugs.openjdk.java.net/browse/JDK-8015110
|
||||
current = start;
|
||||
return DONE;
|
||||
} else {
|
||||
return first();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int previous() {
|
||||
if (current == start) {
|
||||
return DONE;
|
||||
} else {
|
||||
return first();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setText(CharacterIterator newText) {
|
||||
start = newText.getBeginIndex();
|
||||
end = newText.getEndIndex();
|
||||
text = newText;
|
||||
current = start;
|
||||
}
|
||||
}
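A corresponding sketch for WholeBreakIterator, which always yields a single fragment spanning the entire text (illustrative only):

import java.text.BreakIterator;
import java.text.StringCharacterIterator;

import org.apache.lucene.search.uhighlight.WholeBreakIterator;

public class WholeBreakIteratorDemo {
  public static void main(String[] args) {
    BreakIterator bi = new WholeBreakIterator();
    bi.setText(new StringCharacterIterator("However long the text is, it stays one fragment."));
    System.out.println("start=" + bi.first() + ", end=" + bi.last()); // 0 and the text length
  }
}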
|
File diff suppressed because one or more lines are too long
|
@@ -1,884 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.postingshighlight;
|
||||
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.ConstantScoreQuery;
|
||||
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.RegexpQuery;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TermRangeQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.search.spans.SpanFirstQuery;
|
||||
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanNotQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/**
|
||||
* Some tests that override {@link PostingsHighlighter#getIndexAnalyzer} to
|
||||
* highlight wildcard, fuzzy, etc. queries.
|
||||
*/
|
||||
public class TestMultiTermHighlighting extends LuceneTestCase {
|
||||
|
||||
public void testWildcards() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
Query query = new WildcardQuery(new Term("body", "te*"));
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
// wrong field
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
|
||||
bq.add(new WildcardQuery(new Term("bogus", "te*")), BooleanClause.Occur.SHOULD);
|
||||
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a test.", snippets[0]);
|
||||
assertEquals("Test a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testOnePrefix() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
Query query = new PrefixQuery(new Term("body", "te"));
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
// wrong field
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
|
||||
bq.add(new PrefixQuery(new Term("bogus", "te")), BooleanClause.Occur.SHOULD);
|
||||
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a test.", snippets[0]);
|
||||
assertEquals("Test a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testOneRegexp() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
Query query = new RegexpQuery(new Term("body", "te.*"));
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
// wrong field
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
|
||||
bq.add(new RegexpQuery(new Term("bogus", "te.*")), BooleanClause.Occur.SHOULD);
|
||||
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a test.", snippets[0]);
|
||||
assertEquals("Test a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testOneFuzzy() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
Query query = new FuzzyQuery(new Term("body", "tets"), 1);
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
// with prefix
|
||||
query = new FuzzyQuery(new Term("body", "tets"), 1, 2);
|
||||
topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
// wrong field
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
|
||||
bq.add(new FuzzyQuery(new Term("bogus", "tets"), 1), BooleanClause.Occur.SHOULD);
|
||||
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a test.", snippets[0]);
|
||||
assertEquals("Test a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testRanges() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
Query query = TermRangeQuery.newStringRange("body", "ta", "tf", true, true);
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
// null start
|
||||
query = TermRangeQuery.newStringRange("body", null, "tf", true, true);
|
||||
topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This <b>is</b> <b>a</b> <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> <b>a</b> <b>one</b> <b>sentence</b> <b>document</b>.", snippets[1]);
|
||||
|
||||
// null end
|
||||
query = TermRangeQuery.newStringRange("body", "ta", null, true, true);
|
||||
topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("<b>This</b> is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
// exact start inclusive
|
||||
query = TermRangeQuery.newStringRange("body", "test", "tf", true, true);
|
||||
topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
// exact end inclusive
|
||||
query = TermRangeQuery.newStringRange("body", "ta", "test", true, true);
|
||||
topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
// exact start exclusive
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
|
||||
bq.add(TermRangeQuery.newStringRange("body", "test", "tf", false, true), BooleanClause.Occur.SHOULD);
|
||||
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a test.", snippets[0]);
|
||||
assertEquals("Test a one sentence document.", snippets[1]);
|
||||
|
||||
// exact end exclusive
|
||||
bq = new BooleanQuery.Builder();
|
||||
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
|
||||
bq.add(TermRangeQuery.newStringRange("body", "ta", "test", true, false), BooleanClause.Occur.SHOULD);
|
||||
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a test.", snippets[0]);
|
||||
assertEquals("Test a one sentence document.", snippets[1]);
|
||||
|
||||
// wrong field
|
||||
bq = new BooleanQuery.Builder();
|
||||
bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
|
||||
bq.add(TermRangeQuery.newStringRange("bogus", "ta", "tf", true, true), BooleanClause.Occur.SHOULD);
|
||||
topDocs = searcher.search(bq.build(), 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", bq.build(), searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a test.", snippets[0]);
|
||||
assertEquals("Test a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testWildcardInBoolean() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
BooleanQuery.Builder query = new BooleanQuery.Builder();
|
||||
query.add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.SHOULD);
|
||||
TopDocs topDocs = searcher.search(query.build(), 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query.build(), searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
// must not
|
||||
query = new BooleanQuery.Builder();
|
||||
query.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
|
||||
query.add(new WildcardQuery(new Term("bogus", "te*")), BooleanClause.Occur.MUST_NOT);
|
||||
topDocs = searcher.search(query.build(), 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
snippets = highlighter.highlight("body", query.build(), searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a test.", snippets[0]);
|
||||
assertEquals("Test a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testWildcardInFiltered() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
Query query = new BooleanQuery.Builder()
|
||||
.add(new WildcardQuery(new Term("body", "te*")), Occur.MUST)
|
||||
.add(new TermQuery(new Term("body", "test")), Occur.FILTER)
|
||||
.build();
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testWildcardInConstantScore() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
ConstantScoreQuery query = new ConstantScoreQuery(new WildcardQuery(new Term("body", "te*")));
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testWildcardInDisjunctionMax() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
DisjunctionMaxQuery query = new DisjunctionMaxQuery(
|
||||
Collections.singleton(new WildcardQuery(new Term("body", "te*"))), 0);
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSpanWildcard() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
Query query = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSpanOr() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
SpanQuery childQuery = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
|
||||
Query query = new SpanOrQuery(new SpanQuery[] { childQuery });
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSpanNear() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
SpanQuery childQuery = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
|
||||
Query query = new SpanNearQuery(new SpanQuery[] { childQuery, childQuery }, 0, false);
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSpanNot() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
SpanQuery include = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
|
||||
SpanQuery exclude = new SpanTermQuery(new Term("body", "bogus"));
|
||||
Query query = new SpanNotQuery(include, exclude);
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSpanPositionCheck() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test.");
|
||||
iw.addDocument(doc);
|
||||
body.setStringValue("Test a one sentence document.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) {
|
||||
return analyzer;
|
||||
}
|
||||
};
|
||||
SpanQuery childQuery = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
|
||||
Query query = new SpanFirstQuery(childQuery, 1000000);
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertEquals(2, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(2, snippets.length);
|
||||
assertEquals("This is a <b>test</b>.", snippets[0]);
|
||||
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
  /** Runs a query with two MTQs and confirms the formatter
   *  can tell which query matched which hit. */
  public void testWhichMTQMatched() throws Exception {
    Directory dir = newDirectory();
    // use simpleanalyzer for more natural tokenization (else "test." is a token)
    final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field body = new Field("body", "", offsetsType);
    Document doc = new Document();
    doc.add(body);

    body.setStringValue("Test a one sentence document.");
    iw.addDocument(doc);

    IndexReader ir = iw.getReader();
    iw.close();

    IndexSearcher searcher = newSearcher(ir);
    PostingsHighlighter highlighter = new PostingsHighlighter() {
      @Override
      protected Analyzer getIndexAnalyzer(String field) {
        return analyzer;
      }
    };
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    query.add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.SHOULD);
    query.add(new WildcardQuery(new Term("body", "one")), BooleanClause.Occur.SHOULD);
    query.add(new WildcardQuery(new Term("body", "se*")), BooleanClause.Occur.SHOULD);
    TopDocs topDocs = searcher.search(query.build(), 10, Sort.INDEXORDER);
    assertEquals(1, topDocs.totalHits);
    String snippets[] = highlighter.highlight("body", query.build(), searcher, topDocs);
    assertEquals(1, snippets.length);

    // Default formatter just bolds each hit:
    assertEquals("<b>Test</b> a <b>one</b> <b>sentence</b> document.", snippets[0]);

    // Now use our own formatter, that also stuffs the
    // matching term's text into the result:
    highlighter = new PostingsHighlighter() {
      @Override
      protected Analyzer getIndexAnalyzer(String field) {
        return analyzer;
      }

      @Override
      protected PassageFormatter getFormatter(String field) {
        return new PassageFormatter() {

          @Override
          public Object format(Passage passages[], String content) {
            // Copied from DefaultPassageFormatter, but
            // tweaked to include the matched term:
            StringBuilder sb = new StringBuilder();
            int pos = 0;
            for (Passage passage : passages) {
              // don't add ellipsis if it's the first one, or if it's connected.
              if (passage.startOffset > pos && pos > 0) {
                sb.append("... ");
              }
              pos = passage.startOffset;
              for (int i = 0; i < passage.numMatches; i++) {
                int start = passage.matchStarts[i];
                int end = passage.matchEnds[i];
                // it's possible to have overlapping terms
                if (start > pos) {
                  sb.append(content, pos, start);
                }
                if (end > pos) {
                  sb.append("<b>");
                  sb.append(content, Math.max(pos, start), end);
                  sb.append('(');
                  sb.append(passage.getMatchTerms()[i].utf8ToString());
                  sb.append(')');
                  sb.append("</b>");
                  pos = end;
                }
              }
              // it's possible a "term" from the analyzer could span a sentence boundary.
              sb.append(content, pos, Math.max(pos, passage.endOffset));
              pos = passage.endOffset;
            }
            return sb.toString();
          }
        };
      }
    };

    assertEquals(1, topDocs.totalHits);
    snippets = highlighter.highlight("body", query.build(), searcher, topDocs);
    assertEquals(1, snippets.length);

    // Default formatter bolds each hit:
    assertEquals("<b>Test(body:te*)</b> a <b>one(body:one)</b> <b>sentence(body:se*)</b> document.", snippets[0]);

    ir.close();
    dir.close();
  }
}

File diff suppressed because it is too large

@@ -1,324 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.postingshighlight;

import java.io.IOException;
import java.util.HashSet;
import java.util.Random;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

public class TestPostingsHighlighterRanking extends LuceneTestCase {
  /**
   * indexes a bunch of gibberish, and then highlights top(n).
   * asserts that top(n) highlights is a subset of top(n+1) up to some max N
   */
  // TODO: this only tests single-valued fields. we should also index multiple values per field!
  public void testRanking() throws Exception {
    // number of documents: we will check each one
    final int numDocs = atLeast(100);
    // number of top-N snippets, we will check 1 .. N
    final int maxTopN = 5;
    // maximum number of elements to put in a sentence.
    final int maxSentenceLength = 10;
    // maximum number of sentences in a document
    final int maxNumSentences = 20;

    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
    Document document = new Document();
    Field id = new StringField("id", "", Field.Store.NO);
    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field body = new Field("body", "", offsetsType);
    document.add(id);
    document.add(body);

    for (int i = 0; i < numDocs; i++) {
      StringBuilder bodyText = new StringBuilder();
      int numSentences = TestUtil.nextInt(random(), 1, maxNumSentences);
      for (int j = 0; j < numSentences; j++) {
        bodyText.append(newSentence(random(), maxSentenceLength));
      }
      body.setStringValue(bodyText.toString());
      id.setStringValue(Integer.toString(i));
      iw.addDocument(document);
    }

    IndexReader ir = iw.getReader();
    IndexSearcher searcher = newSearcher(ir);
    for (int i = 0; i < numDocs; i++) {
      checkDocument(searcher, i, maxTopN);
    }
    iw.close();
    ir.close();
    dir.close();
  }

  private void checkDocument(IndexSearcher is, int doc, int maxTopN) throws IOException {
    for (int ch = 'a'; ch <= 'z'; ch++) {
      Term term = new Term("body", "" + (char)ch);
      // check a simple term query
      checkQuery(is, new TermQuery(term), doc, maxTopN);
      // check a boolean query
      BooleanQuery.Builder bq = new BooleanQuery.Builder();
      bq.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
      Term nextTerm = new Term("body", "" + (char)(ch+1));
      bq.add(new TermQuery(nextTerm), BooleanClause.Occur.SHOULD);
      checkQuery(is, bq.build(), doc, maxTopN);
    }
  }

  private void checkQuery(IndexSearcher is, Query query, int doc, int maxTopN) throws IOException {
    for (int n = 1; n < maxTopN; n++) {
      final FakePassageFormatter f1 = new FakePassageFormatter();
      PostingsHighlighter p1 = new PostingsHighlighter(Integer.MAX_VALUE-1) {
        @Override
        protected PassageFormatter getFormatter(String field) {
          assertEquals("body", field);
          return f1;
        }
      };

      final FakePassageFormatter f2 = new FakePassageFormatter();
      PostingsHighlighter p2 = new PostingsHighlighter(Integer.MAX_VALUE-1) {
        @Override
        protected PassageFormatter getFormatter(String field) {
          assertEquals("body", field);
          return f2;
        }
      };

      BooleanQuery.Builder bq = new BooleanQuery.Builder();
      bq.add(query, BooleanClause.Occur.MUST);
      bq.add(new TermQuery(new Term("id", Integer.toString(doc))), BooleanClause.Occur.MUST);
      TopDocs td = is.search(bq.build(), 1);
      p1.highlight("body", bq.build(), is, td, n);
      p2.highlight("body", bq.build(), is, td, n+1);
      assertTrue(f2.seen.containsAll(f1.seen));
    }
  }

  /**
   * returns a new random sentence, up to maxSentenceLength "words" in length.
   * each word is a single character (a-z). The first one is capitalized.
   */
  private String newSentence(Random r, int maxSentenceLength) {
    StringBuilder sb = new StringBuilder();
    int numElements = TestUtil.nextInt(r, 1, maxSentenceLength);
    for (int i = 0; i < numElements; i++) {
      if (sb.length() > 0) {
        sb.append(' ');
        sb.append((char) TestUtil.nextInt(r, 'a', 'z'));
      } else {
        // capitalize the first word to help breakiterator
        sb.append((char) TestUtil.nextInt(r, 'A', 'Z'));
      }
    }
    sb.append(". "); // finalize sentence
    return sb.toString();
  }

  /**
   * a fake formatter that doesn't actually format passages.
   * instead it just collects them for asserts!
   */
  static class FakePassageFormatter extends PassageFormatter {
    HashSet<Pair> seen = new HashSet<>();

    @Override
    public String format(Passage passages[], String content) {
      for (Passage p : passages) {
        // verify some basics about the passage
        assertTrue(p.getScore() >= 0);
        assertTrue(p.getNumMatches() > 0);
        assertTrue(p.getStartOffset() >= 0);
        assertTrue(p.getStartOffset() <= content.length());
        assertTrue(p.getEndOffset() >= p.getStartOffset());
        assertTrue(p.getEndOffset() <= content.length());
        // we use a very simple analyzer. so we can assert the matches are correct
        int lastMatchStart = -1;
        for (int i = 0; i < p.getNumMatches(); i++) {
          BytesRef term = p.getMatchTerms()[i];
          int matchStart = p.getMatchStarts()[i];
          assertTrue(matchStart >= 0);
          // must at least start within the passage
          assertTrue(matchStart < p.getEndOffset());
          int matchEnd = p.getMatchEnds()[i];
          assertTrue(matchEnd >= 0);
          // always moving forward
          assertTrue(matchStart >= lastMatchStart);
          lastMatchStart = matchStart;
          // single character terms
          assertEquals(matchStart+1, matchEnd);
          // and the offsets must be correct...
          assertEquals(1, term.length);
          assertEquals((char)term.bytes[term.offset], Character.toLowerCase(content.charAt(matchStart)));
        }
        // record just the start/end offset for simplicity
        seen.add(new Pair(p.getStartOffset(), p.getEndOffset()));
      }
      return "bogus!!!!!!";
    }
  }

  static class Pair {
    final int start;
    final int end;

    Pair(int start, int end) {
      this.start = start;
      this.end = end;
    }

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + end;
      result = prime * result + start;
      return result;
    }

    @Override
    public boolean equals(Object obj) {
      if (this == obj) {
        return true;
      }
      if (obj == null) {
        return false;
      }
      if (getClass() != obj.getClass()) {
        return false;
      }
      Pair other = (Pair) obj;
      if (end != other.end) {
        return false;
      }
      if (start != other.start) {
        return false;
      }
      return true;
    }

    @Override
    public String toString() {
      return "Pair [start=" + start + ", end=" + end + "]";
    }
  }

  /** sets b=0 to disable passage length normalization */
  public void testCustomB() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field body = new Field("body", "", offsetsType);
    Document doc = new Document();
    doc.add(body);

    body.setStringValue("This is a test. This test is a better test but the sentence is excruiatingly long, " +
        "you have no idea how painful it was for me to type this long sentence into my IDE.");
    iw.addDocument(doc);

    IndexReader ir = iw.getReader();
    iw.close();

    IndexSearcher searcher = newSearcher(ir);
    PostingsHighlighter highlighter = new PostingsHighlighter(10000) {
      @Override
      protected PassageScorer getScorer(String field) {
        return new PassageScorer(1.2f, 0, 87);
      }
    };
    Query query = new TermQuery(new Term("body", "test"));
    TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
    assertEquals(1, topDocs.totalHits);
    String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1);
    assertEquals(1, snippets.length);
    assertTrue(snippets[0].startsWith("This <b>test</b> is a better <b>test</b>"));

    ir.close();
    dir.close();
  }

  /** sets k1=0 for simple coordinate-level match (# of query terms present) */
  public void testCustomK1() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field body = new Field("body", "", offsetsType);
    Document doc = new Document();
    doc.add(body);

    body.setStringValue("This has only foo foo. " +
        "On the other hand this sentence contains both foo and bar. " +
        "This has only bar bar bar bar bar bar bar bar bar bar bar bar.");
    iw.addDocument(doc);

    IndexReader ir = iw.getReader();
    iw.close();

    IndexSearcher searcher = newSearcher(ir);
    PostingsHighlighter highlighter = new PostingsHighlighter(10000) {
      @Override
      protected PassageScorer getScorer(String field) {
        return new PassageScorer(0, 0.75f, 87);
      }
    };
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    query.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD);
    query.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD);
    TopDocs topDocs = searcher.search(query.build(), 10, Sort.INDEXORDER);
    assertEquals(1, topDocs.totalHits);
    String snippets[] = highlighter.highlight("body", query.build(), searcher, topDocs, 1);
    assertEquals(1, snippets.length);
    assertTrue(snippets[0].startsWith("On the other hand"));

    ir.close();
    dir.close();
  }
}

@@ -23,7 +23,6 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.QueryBuilder;

@@ -0,0 +1,114 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.util.Locale;

import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.util.LuceneTestCase;

import static org.apache.lucene.search.uhighlight.TestWholeBreakIterator.assertSameBreaks;
import static org.hamcrest.CoreMatchers.equalTo;

public class TestCustomSeparatorBreakIterator extends LuceneTestCase {

  private static final Character[] SEPARATORS = new Character[]{' ', '\u0000', 8233};

  public void testBreakOnCustomSeparator() throws Exception {
    Character separator = randomSeparator();
    BreakIterator bi = new CustomSeparatorBreakIterator(separator);
    String source = "this" + separator + "is" + separator + "the" + separator + "first" + separator + "sentence";
    bi.setText(source);
    assertThat(bi.current(), equalTo(0));
    assertThat(bi.first(), equalTo(0));
    assertThat(source.substring(bi.current(), bi.next()), equalTo("this" + separator));
    assertThat(source.substring(bi.current(), bi.next()), equalTo("is" + separator));
    assertThat(source.substring(bi.current(), bi.next()), equalTo("the" + separator));
    assertThat(source.substring(bi.current(), bi.next()), equalTo("first" + separator));
    assertThat(source.substring(bi.current(), bi.next()), equalTo("sentence"));
    assertThat(bi.next(), equalTo(BreakIterator.DONE));

    assertThat(bi.last(), equalTo(source.length()));
    int current = bi.current();
    assertThat(source.substring(bi.previous(), current), equalTo("sentence"));
    current = bi.current();
    assertThat(source.substring(bi.previous(), current), equalTo("first" + separator));
    current = bi.current();
    assertThat(source.substring(bi.previous(), current), equalTo("the" + separator));
    current = bi.current();
    assertThat(source.substring(bi.previous(), current), equalTo("is" + separator));
    current = bi.current();
    assertThat(source.substring(bi.previous(), current), equalTo("this" + separator));
    assertThat(bi.previous(), equalTo(BreakIterator.DONE));
    assertThat(bi.current(), equalTo(0));

    assertThat(source.substring(0, bi.following(9)), equalTo("this" + separator + "is" + separator + "the" + separator));

    assertThat(source.substring(0, bi.preceding(9)), equalTo("this" + separator + "is" + separator));

    assertThat(bi.first(), equalTo(0));
    assertThat(source.substring(0, bi.next(3)), equalTo("this" + separator + "is" + separator + "the" + separator));
  }

  public void testSingleSentences() throws Exception {
    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
    BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
    assertSameBreaks("a", expected, actual);
    assertSameBreaks("ab", expected, actual);
    assertSameBreaks("abc", expected, actual);
    assertSameBreaks("", expected, actual);
  }

  public void testSliceEnd() throws Exception {
    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
    BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
    assertSameBreaks("a000", 0, 1, expected, actual);
    assertSameBreaks("ab000", 0, 1, expected, actual);
    assertSameBreaks("abc000", 0, 1, expected, actual);
    assertSameBreaks("000", 0, 0, expected, actual);
  }

  public void testSliceStart() throws Exception {
    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
    BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
    assertSameBreaks("000a", 3, 1, expected, actual);
    assertSameBreaks("000ab", 3, 2, expected, actual);
    assertSameBreaks("000abc", 3, 3, expected, actual);
    assertSameBreaks("000", 3, 0, expected, actual);
  }

  public void testSliceMiddle() throws Exception {
    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
    BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
    assertSameBreaks("000a000", 3, 1, expected, actual);
    assertSameBreaks("000ab000", 3, 2, expected, actual);
    assertSameBreaks("000abc000", 3, 3, expected, actual);
    assertSameBreaks("000000", 3, 0, expected, actual);
  }

  /** the current position must be ignored, initial position is always first() */
  public void testFirstPosition() throws Exception {
    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
    BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
    assertSameBreaks("000ab000", 3, 2, 4, expected, actual);
  }

  private static char randomSeparator() {
    return RandomPicks.randomFrom(random(), SEPARATORS);
  }
}

@@ -49,7 +49,6 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;

@@ -0,0 +1,134 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Locale;

import org.apache.lucene.util.LuceneTestCase;

public class TestWholeBreakIterator extends LuceneTestCase {

  /** For single sentences, we know WholeBreakIterator should break the same as a sentence iterator */
  public void testSingleSentences() throws Exception {
    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
    BreakIterator actual = new WholeBreakIterator();
    assertSameBreaks("a", expected, actual);
    assertSameBreaks("ab", expected, actual);
    assertSameBreaks("abc", expected, actual);
    assertSameBreaks("", expected, actual);
  }

  public void testSliceEnd() throws Exception {
    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
    BreakIterator actual = new WholeBreakIterator();
    assertSameBreaks("a000", 0, 1, expected, actual);
    assertSameBreaks("ab000", 0, 1, expected, actual);
    assertSameBreaks("abc000", 0, 1, expected, actual);
    assertSameBreaks("000", 0, 0, expected, actual);
  }

  public void testSliceStart() throws Exception {
    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
    BreakIterator actual = new WholeBreakIterator();
    assertSameBreaks("000a", 3, 1, expected, actual);
    assertSameBreaks("000ab", 3, 2, expected, actual);
    assertSameBreaks("000abc", 3, 3, expected, actual);
    assertSameBreaks("000", 3, 0, expected, actual);
  }

  public void testSliceMiddle() throws Exception {
    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
    BreakIterator actual = new WholeBreakIterator();
    assertSameBreaks("000a000", 3, 1, expected, actual);
    assertSameBreaks("000ab000", 3, 2, expected, actual);
    assertSameBreaks("000abc000", 3, 3, expected, actual);
    assertSameBreaks("000000", 3, 0, expected, actual);
  }

  /** the current position must be ignored, initial position is always first() */
  public void testFirstPosition() throws Exception {
    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
    BreakIterator actual = new WholeBreakIterator();
    assertSameBreaks("000ab000", 3, 2, 4, expected, actual);
  }

  public static void assertSameBreaks(String text, BreakIterator expected, BreakIterator actual) {
    assertSameBreaks(new StringCharacterIterator(text),
                     new StringCharacterIterator(text),
                     expected,
                     actual);
  }

  public static void assertSameBreaks(String text, int offset, int length, BreakIterator expected, BreakIterator actual) {
    assertSameBreaks(text, offset, length, offset, expected, actual);
  }

  public static void assertSameBreaks(String text, int offset, int length, int current, BreakIterator expected, BreakIterator actual) {
    assertSameBreaks(new StringCharacterIterator(text, offset, offset+length, current),
                     new StringCharacterIterator(text, offset, offset+length, current),
                     expected,
                     actual);
  }

  /** Asserts that two breakiterators break the text the same way */
  public static void assertSameBreaks(CharacterIterator one, CharacterIterator two, BreakIterator expected, BreakIterator actual) {
    expected.setText(one);
    actual.setText(two);

    assertEquals(expected.current(), actual.current());

    // next()
    int v = expected.current();
    while (v != BreakIterator.DONE) {
      assertEquals(v = expected.next(), actual.next());
      assertEquals(expected.current(), actual.current());
    }

    // first()
    assertEquals(expected.first(), actual.first());
    assertEquals(expected.current(), actual.current());
    // last()
    assertEquals(expected.last(), actual.last());
    assertEquals(expected.current(), actual.current());

    // previous()
    v = expected.current();
    while (v != BreakIterator.DONE) {
      assertEquals(v = expected.previous(), actual.previous());
      assertEquals(expected.current(), actual.current());
    }

    // following()
    for (int i = one.getBeginIndex(); i <= one.getEndIndex(); i++) {
      expected.first();
      actual.first();
      assertEquals(expected.following(i), actual.following(i));
      assertEquals(expected.current(), actual.current());
    }

    // preceding()
    for (int i = one.getBeginIndex(); i <= one.getEndIndex(); i++) {
      expected.last();
      actual.last();
      assertEquals(expected.preceding(i), actual.preceding(i));
      assertEquals(expected.current(), actual.current());
    }
  }
}

@@ -28,13 +28,13 @@ import java.util.function.Predicate;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.uhighlight.DefaultPassageFormatter;
import org.apache.lucene.search.uhighlight.LengthGoalBreakIterator;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.search.uhighlight.WholeBreakIterator;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;