mirror of https://github.com/apache/lucene.git
LUCENE-7620: UnifiedHighlighter: new LengthGoalBreakIterator wrapper
This commit is contained in:
parent
ac14fc32e0
commit
ea49989524
|
@ -218,6 +218,10 @@ Improvements
|
||||||
* LUCENE-7614: Complex Phrase Query parser ignores double quotes around single token
|
* LUCENE-7614: Complex Phrase Query parser ignores double quotes around single token
|
||||||
prefix, wildcard, range queries (Mikhail Khludnev)
|
prefix, wildcard, range queries (Mikhail Khludnev)
|
||||||
|
|
||||||
|
* LUCENE-7620: Added LengthGoalBreakIterator, a wrapper around another B.I. to skip breaks
|
||||||
|
that would create Passages that are too short. Only for use with the UnifiedHighlighter
|
||||||
|
(and probably PostingsHighlighter). (David Smiley)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-7568: Optimize merging when index sorting is used but the
|
* LUCENE-7568: Optimize merging when index sorting is used but the
|
||||||
|
|
|
@ -0,0 +1,185 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.search.uhighlight;
|
||||||
|
|
||||||
|
import java.text.BreakIterator;
|
||||||
|
import java.text.CharacterIterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wraps another {@link BreakIterator} to skip past breaks that would result in passages that are too
|
||||||
|
* short. It's still possible to get a short passage but only at the very end of the input text.
|
||||||
|
* <p>
|
||||||
|
* Important: This is not a general purpose {@link BreakIterator}; it's only designed to work in a way
|
||||||
|
* compatible with the {@link UnifiedHighlighter}. Some assumptions are checked with Java assertions.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public class LengthGoalBreakIterator extends BreakIterator {
|
||||||
|
|
||||||
|
private final BreakIterator baseIter;
|
||||||
|
private final int lengthGoal;
|
||||||
|
private final boolean isMinimumLength; // if false then is "closest to" length
|
||||||
|
|
||||||
|
/** Breaks will be at least {@code minLength} apart (to the extent possible). */
|
||||||
|
public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength) {
|
||||||
|
return new LengthGoalBreakIterator(baseIter, minLength, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Breaks will be on average {@code targetLength} apart; the closest break to this target (before or after)
|
||||||
|
* is chosen. */
|
||||||
|
public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength) {
|
||||||
|
return new LengthGoalBreakIterator(baseIter, targetLength, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
private LengthGoalBreakIterator(BreakIterator baseIter, int lengthGoal, boolean isMinimumLength) {
|
||||||
|
this.baseIter = baseIter;
|
||||||
|
this.lengthGoal = lengthGoal;
|
||||||
|
this.isMinimumLength = isMinimumLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
// note: the only methods that will get called are setText(txt), getText(),
|
||||||
|
// getSummaryPassagesNoHighlight: current(), first(), next()
|
||||||
|
// highlightOffsetsEnums: preceding(int), and following(int)
|
||||||
|
// Nonetheless we make some attempt to implement the rest; mostly delegating.
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
String goalDesc = isMinimumLength ? "minLen" : "targetLen";
|
||||||
|
return getClass().getSimpleName() + "{" + goalDesc + "=" + lengthGoal + ", baseIter=" + baseIter + "}";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object clone() {
|
||||||
|
return new LengthGoalBreakIterator((BreakIterator) baseIter.clone(), lengthGoal, isMinimumLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public CharacterIterator getText() {
|
||||||
|
return baseIter.getText();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setText(String newText) {
|
||||||
|
baseIter.setText(newText);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setText(CharacterIterator newText) {
|
||||||
|
baseIter.setText(newText);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int current() {
|
||||||
|
return baseIter.current();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int first() {
|
||||||
|
return baseIter.first();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int last() {
|
||||||
|
return baseIter.last();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int next(int n) {
|
||||||
|
assert false : "Not supported";
|
||||||
|
return baseIter.next(n); // probably wrong
|
||||||
|
}
|
||||||
|
|
||||||
|
// called by getSummaryPassagesNoHighlight to generate default summary.
|
||||||
|
@Override
|
||||||
|
public int next() {
|
||||||
|
return following(current());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int previous() {
|
||||||
|
assert false : "Not supported";
|
||||||
|
return baseIter.previous();
|
||||||
|
}
|
||||||
|
|
||||||
|
// called while the current position is the start of a new passage; find end of passage
|
||||||
|
@Override
|
||||||
|
public int following(int followingIdx) {
|
||||||
|
final int startIdx = current();
|
||||||
|
if (followingIdx < startIdx) {
|
||||||
|
assert false : "Not supported";
|
||||||
|
return baseIter.following(followingIdx);
|
||||||
|
}
|
||||||
|
final int targetIdx = startIdx + lengthGoal;
|
||||||
|
// When followingIdx >= targetIdx, we can simply delegate since it will be >= the target
|
||||||
|
if (followingIdx >= targetIdx - 1) {
|
||||||
|
return baseIter.following(followingIdx);
|
||||||
|
}
|
||||||
|
// If target exceeds the text length, return the last index.
|
||||||
|
if (targetIdx >= getText().getEndIndex()) {
|
||||||
|
return baseIter.last();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find closest break >= the target
|
||||||
|
final int afterIdx = baseIter.following(targetIdx - 1);
|
||||||
|
if (afterIdx == DONE) { // we're at the end; can this happen?
|
||||||
|
return current();
|
||||||
|
}
|
||||||
|
if (afterIdx == targetIdx) { // right on the money
|
||||||
|
return afterIdx;
|
||||||
|
}
|
||||||
|
if (isMinimumLength) { // thus never undershoot
|
||||||
|
return afterIdx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// note: it is a shame that we invoke preceding() *in addition to* following(); BI's are sometimes expensive.
|
||||||
|
|
||||||
|
// Find closest break < target
|
||||||
|
final int beforeIdx = baseIter.preceding(targetIdx); // or could do baseIter.previous() but we hope the BI implements preceding()
|
||||||
|
if (beforeIdx <= followingIdx) { // too far back
|
||||||
|
return moveToBreak(afterIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (targetIdx - beforeIdx <= afterIdx - targetIdx) {
|
||||||
|
return beforeIdx;
|
||||||
|
}
|
||||||
|
return moveToBreak(afterIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int moveToBreak(int idx) { // precondition: idx is a known break
|
||||||
|
// bi.isBoundary(idx) has side-effect of moving the position. Not obvious!
|
||||||
|
//boolean moved = baseIter.isBoundary(idx); // probably not particularly expensive
|
||||||
|
//assert moved && current() == idx;
|
||||||
|
|
||||||
|
// TODO fix: Would prefer to do "- 1" instead of "- 2" but CustomSeparatorBreakIterator has a bug.
|
||||||
|
int current = baseIter.following(idx - 2);
|
||||||
|
assert current == idx : "following() didn't move us to the expected index.";
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// called at start of new Passage given first word start offset
|
||||||
|
@Override
|
||||||
|
public int preceding(int offset) {
|
||||||
|
return baseIter.preceding(offset); // no change needed
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isBoundary(int offset) {
|
||||||
|
assert false : "Not supported";
|
||||||
|
return baseIter.isBoundary(offset);
|
||||||
|
}
|
||||||
|
}
|
|
@ -171,6 +171,7 @@ public class Passage {
|
||||||
|
|
||||||
/** @lucene.internal */
|
/** @lucene.internal */
|
||||||
public void setEndOffset(int endOffset) {
|
public void setEndOffset(int endOffset) {
|
||||||
|
assert startOffset <= endOffset;
|
||||||
this.endOffset = endOffset;
|
this.endOffset = endOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,104 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.search.uhighlight;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.QueryBuilder;
|
||||||
|
|
||||||
|
public class LengthGoalBreakIteratorTest extends LuceneTestCase {
|
||||||
|
private static final String FIELD = "body";
|
||||||
|
|
||||||
|
// We test LengthGoalBreakIterator as it is used by the UnifiedHighlighter instead of directly, because it is
|
||||||
|
// not a general purpose BreakIterator. A unit test of it directly wouldn't give as much confidence.
|
||||||
|
|
||||||
|
private final Analyzer analyzer =
|
||||||
|
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);//whitespace, punctuation, lowercase
|
||||||
|
|
||||||
|
// We do a '.' BreakIterator and test varying the length goal.
|
||||||
|
// 0 1
|
||||||
|
// 01234567890123456789
|
||||||
|
final String content = "Aa bb. Cc dd. Ee ff";
|
||||||
|
|
||||||
|
public void testTargetLen() throws IOException {
|
||||||
|
// "goal" means target length goal to find closest break
|
||||||
|
|
||||||
|
// at first word:
|
||||||
|
Query query = query("aa");
|
||||||
|
assertEquals("almost two sent",
|
||||||
|
"<b>Aa</b> bb.", highlightClosestToLen(content, query, 9));
|
||||||
|
assertEquals( "barely two sent",
|
||||||
|
"<b>Aa</b> bb. Cc dd.", highlightClosestToLen(content, query, 10));
|
||||||
|
assertEquals("long goal",
|
||||||
|
"<b>Aa</b> bb. Cc dd. Ee ff", highlightClosestToLen(content, query, 17 + random().nextInt(20)));
|
||||||
|
|
||||||
|
// at some word not at start of passage
|
||||||
|
query = query("dd");
|
||||||
|
assertEquals("short goal",
|
||||||
|
" Cc <b>dd</b>.", highlightClosestToLen(content, query, random().nextInt(5)));
|
||||||
|
assertEquals("almost two sent",
|
||||||
|
" Cc <b>dd</b>.", highlightClosestToLen(content, query, 10));
|
||||||
|
assertEquals("barely two sent",
|
||||||
|
" Cc <b>dd</b>. Ee ff", highlightClosestToLen(content, query, 11));
|
||||||
|
assertEquals("long goal",
|
||||||
|
" Cc <b>dd</b>. Ee ff", highlightClosestToLen(content, query, 12 + random().nextInt(20)));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMinLen() throws IOException {
|
||||||
|
// minLen mode is simpler than targetLen... just test a few cases
|
||||||
|
|
||||||
|
Query query = query("dd");
|
||||||
|
assertEquals("almost two sent",
|
||||||
|
" Cc <b>dd</b>.", highlightMinLen(content, query, 6));
|
||||||
|
assertEquals("barely two sent",
|
||||||
|
" Cc <b>dd</b>. Ee ff", highlightMinLen(content, query, 7));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDefaultSummaryTargetLen() throws IOException {
|
||||||
|
Query query = query("zz");
|
||||||
|
assertEquals("Aa bb.",
|
||||||
|
highlightClosestToLen(content, query, random().nextInt(10))); // < 10
|
||||||
|
assertEquals("Aa bb. Cc dd.",
|
||||||
|
highlightClosestToLen(content, query, 10 + 6)); // cusp of adding 3rd sentence
|
||||||
|
assertEquals("Aa bb. Cc dd. Ee ff",
|
||||||
|
highlightClosestToLen(content, query, 17 + random().nextInt(20))); // >= 14
|
||||||
|
}
|
||||||
|
|
||||||
|
private Query query(String qStr) {
|
||||||
|
return new QueryBuilder(analyzer).createBooleanQuery(FIELD, qStr);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String highlightClosestToLen(String content, Query query, int lengthGoal) throws IOException {
|
||||||
|
UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
|
||||||
|
highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createClosestToLength(new CustomSeparatorBreakIterator('.'), lengthGoal));
|
||||||
|
return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String highlightMinLen(String content, Query query, int lengthGoal) throws IOException {
|
||||||
|
// differs from above only by "createMinLength"
|
||||||
|
UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
|
||||||
|
highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createMinLength(new CustomSeparatorBreakIterator('.'), lengthGoal));
|
||||||
|
return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue