mirror of https://github.com/apache/lucene.git
LUCENE-7559: UnifiedHighlighter: Increase Passage visibility
This commit is contained in:
parent
738198ef34
commit
c51e89014a
|
@ -111,6 +111,9 @@ Other
|
|||
|
||||
* LUCENE-7534: fix smokeTestRelease.py to run on Cygwin (Mikhail Khludnev)
|
||||
|
||||
* LUCENE-7559: UnifiedHighlighter: Make Passage more exposed to allow passage creation to
|
||||
be customized. (David Smiley)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-7387: fix defaultCodec in build.xml to account for the line ending (hossman)
|
||||
|
|
|
@ -75,6 +75,9 @@ public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy {
|
|||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
// TODO we could make this go away. MemoryIndexOffsetStrategy could simply split and analyze each value into the
|
||||
// MemoryIndex. TokenStreamOffsetStrategy's hack TokenStreamPostingsEnum could incorporate this logic,
|
||||
// albeit with less code, less hack.
|
||||
private static final class MultiValueTokenStream extends TokenFilter {
|
||||
|
||||
private final String fieldName;
|
||||
|
|
|
@ -63,13 +63,13 @@ public class DefaultPassageFormatter extends PassageFormatter {
|
|||
int pos = 0;
|
||||
for (Passage passage : passages) {
|
||||
// don't add ellipsis if its the first one, or if its connected.
|
||||
if (passage.startOffset > pos && pos > 0) {
|
||||
if (passage.getStartOffset() > pos && pos > 0) {
|
||||
sb.append(ellipsis);
|
||||
}
|
||||
pos = passage.startOffset;
|
||||
for (int i = 0; i < passage.numMatches; i++) {
|
||||
int start = passage.matchStarts[i];
|
||||
int end = passage.matchEnds[i];
|
||||
pos = passage.getStartOffset();
|
||||
for (int i = 0; i < passage.getNumMatches(); i++) {
|
||||
int start = passage.getMatchStarts()[i];
|
||||
int end = passage.getMatchEnds()[i];
|
||||
// its possible to have overlapping terms
|
||||
if (start > pos) {
|
||||
append(sb, content, pos, start);
|
||||
|
@ -82,8 +82,8 @@ public class DefaultPassageFormatter extends PassageFormatter {
|
|||
}
|
||||
}
|
||||
// its possible a "term" from the analyzer could span a sentence boundary.
|
||||
append(sb, content, pos, Math.max(pos, passage.endOffset));
|
||||
pos = passage.endOffset;
|
||||
append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
|
||||
pos = passage.getEndOffset();
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
|
|
@ -117,9 +117,9 @@ public class FieldHighlighter {
|
|||
break;
|
||||
}
|
||||
Passage passage = new Passage();
|
||||
passage.score = Float.NaN;
|
||||
passage.startOffset = pos;
|
||||
passage.endOffset = next;
|
||||
passage.setScore(Float.NaN);
|
||||
passage.setStartOffset(pos);
|
||||
passage.setEndOffset(next);
|
||||
passages.add(passage);
|
||||
pos = next;
|
||||
}
|
||||
|
@ -145,12 +145,12 @@ public class FieldHighlighter {
|
|||
offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY)); // a sentinel for termination
|
||||
|
||||
PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
|
||||
if (left.score < right.score) {
|
||||
if (left.getScore() < right.getScore()) {
|
||||
return -1;
|
||||
} else if (left.score > right.score) {
|
||||
} else if (left.getScore() > right.getScore()) {
|
||||
return 1;
|
||||
} else {
|
||||
return left.startOffset - right.startOffset;
|
||||
return left.getStartOffset() - right.getStartOffset();
|
||||
}
|
||||
});
|
||||
Passage passage = new Passage(); // the current passage in-progress. Will either get reset or added to queue.
|
||||
|
@ -170,12 +170,12 @@ public class FieldHighlighter {
|
|||
continue;
|
||||
}
|
||||
// See if this term should be part of a new passage.
|
||||
if (start >= passage.endOffset) {
|
||||
if (passage.startOffset >= 0) { // true if this passage has terms; otherwise couldn't find any (yet)
|
||||
if (start >= passage.getEndOffset()) {
|
||||
if (passage.getStartOffset() >= 0) { // true if this passage has terms; otherwise couldn't find any (yet)
|
||||
// finalize passage
|
||||
passage.score *= scorer.norm(passage.startOffset);
|
||||
passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset()));
|
||||
// new sentence: first add 'passage' to queue
|
||||
if (passageQueue.size() == maxPassages && passage.score < passageQueue.peek().score) {
|
||||
if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) {
|
||||
passage.reset(); // can't compete, just reset it
|
||||
} else {
|
||||
passageQueue.offer(passage);
|
||||
|
@ -192,8 +192,8 @@ public class FieldHighlighter {
|
|||
break;
|
||||
}
|
||||
// advance breakIterator
|
||||
passage.startOffset = Math.max(breakIterator.preceding(start + 1), 0);
|
||||
passage.endOffset = Math.min(breakIterator.following(start), contentLength);
|
||||
passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0));
|
||||
passage.setEndOffset(Math.min(breakIterator.following(start), contentLength));
|
||||
}
|
||||
// Add this term to the passage.
|
||||
int tf = 0;
|
||||
|
@ -209,12 +209,12 @@ public class FieldHighlighter {
|
|||
off.nextPosition();
|
||||
start = off.startOffset();
|
||||
end = off.endOffset();
|
||||
if (start >= passage.endOffset || end > contentLength) { // it's beyond this passage
|
||||
if (start >= passage.getEndOffset() || end > contentLength) { // it's beyond this passage
|
||||
offsetsEnumQueue.offer(off);
|
||||
break;
|
||||
}
|
||||
}
|
||||
passage.score += off.weight * scorer.tf(tf, passage.endOffset - passage.startOffset);
|
||||
passage.setScore(passage.getScore() + off.weight * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset()));
|
||||
}
|
||||
|
||||
Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
|
||||
|
@ -222,7 +222,7 @@ public class FieldHighlighter {
|
|||
p.sort();
|
||||
}
|
||||
// sort in ascending order
|
||||
Arrays.sort(passages, (left, right) -> left.startOffset - right.startOffset);
|
||||
Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset());
|
||||
return passages;
|
||||
}
|
||||
|
||||
|
|
|
@ -66,9 +66,8 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
|
|||
}
|
||||
|
||||
BytesRef getTerm() throws IOException {
|
||||
// the dp.getPayload thing is a hack -- see MultiTermHighlighting
|
||||
return term != null ? term : postingsEnum.getPayload();
|
||||
// We don't deepcopy() because in this hack we know we don't have to.
|
||||
// TODO TokenStreamOffsetStrategy could override OffsetsEnum; then remove this hack here
|
||||
return term != null ? term : postingsEnum.getPayload(); // abusing payload like this is a total hack!
|
||||
}
|
||||
|
||||
boolean hasMorePositions() throws IOException {
|
||||
|
@ -91,7 +90,8 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
|
|||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (postingsEnum instanceof Closeable) { // the one in MultiTermHighlighting is.
|
||||
// TODO TokenStreamOffsetStrategy could override OffsetsEnum; then this base impl would be no-op.
|
||||
if (postingsEnum instanceof Closeable) {
|
||||
((Closeable) postingsEnum).close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,16 +30,17 @@ import org.apache.lucene.util.RamUsageEstimator;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class Passage {
|
||||
int startOffset = -1;
|
||||
int endOffset = -1;
|
||||
float score = 0.0f;
|
||||
public class Passage {
|
||||
private int startOffset = -1;
|
||||
private int endOffset = -1;
|
||||
private float score = 0.0f;
|
||||
|
||||
int matchStarts[] = new int[8];
|
||||
int matchEnds[] = new int[8];
|
||||
BytesRef matchTerms[] = new BytesRef[8];
|
||||
int numMatches = 0;
|
||||
private int[] matchStarts = new int[8];
|
||||
private int[] matchEnds = new int[8];
|
||||
private BytesRef[] matchTerms = new BytesRef[8];
|
||||
private int numMatches = 0;
|
||||
|
||||
/** @lucene.internal */
|
||||
public void addMatch(int startOffset, int endOffset, BytesRef term) {
|
||||
assert startOffset >= this.startOffset && startOffset <= this.endOffset;
|
||||
if (numMatches == matchStarts.length) {
|
||||
|
@ -61,7 +62,8 @@ public final class Passage {
|
|||
numMatches++;
|
||||
}
|
||||
|
||||
void sort() {
|
||||
/** @lucene.internal */
|
||||
public void sort() {
|
||||
final int starts[] = matchStarts;
|
||||
final int ends[] = matchEnds;
|
||||
final BytesRef terms[] = matchTerms;
|
||||
|
@ -89,7 +91,8 @@ public final class Passage {
|
|||
}.sort(0, numMatches);
|
||||
}
|
||||
|
||||
void reset() {
|
||||
/** @lucene.internal */
|
||||
public void reset() {
|
||||
startOffset = endOffset = -1;
|
||||
score = 0.0f;
|
||||
numMatches = 0;
|
||||
|
@ -158,4 +161,19 @@ public final class Passage {
|
|||
public BytesRef[] getMatchTerms() {
|
||||
return matchTerms;
|
||||
}
|
||||
|
||||
/** @lucene.internal */
|
||||
public void setStartOffset(int startOffset) {
|
||||
this.startOffset = startOffset;
|
||||
}
|
||||
|
||||
/** @lucene.internal */
|
||||
public void setEndOffset(int endOffset) {
|
||||
this.endOffset = endOffset;
|
||||
}
|
||||
|
||||
/** @lucene.internal */
|
||||
public void setScore(float score) {
|
||||
this.score = score;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -69,10 +69,8 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
|
||||
}
|
||||
|
||||
// but this would have a performance cost for likely little gain in the user experience, it
|
||||
// would only serve to make this method less bogus.
|
||||
// instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
|
||||
// TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
|
||||
// See class javadocs.
|
||||
// TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl? See TODOs in OffsetsEnum.
|
||||
private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
|
||||
TokenStream stream; // becomes null when closed
|
||||
final CharacterRunAutomaton[] matchers;
|
||||
|
@ -134,6 +132,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
return currentEndOffset;
|
||||
}
|
||||
|
||||
// TOTAL HACK; used in OffsetsEnum.getTerm()
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
if (matchDescriptions[currentMatch] == null) {
|
||||
|
|
|
@ -697,13 +697,13 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
|
|||
int pos = 0;
|
||||
for (Passage passage : passages) {
|
||||
// don't add ellipsis if its the first one, or if its connected.
|
||||
if (passage.startOffset > pos && pos > 0) {
|
||||
if (passage.getStartOffset() > pos && pos > 0) {
|
||||
sb.append("... ");
|
||||
}
|
||||
pos = passage.startOffset;
|
||||
for (int i = 0; i < passage.numMatches; i++) {
|
||||
int start = passage.matchStarts[i];
|
||||
int end = passage.matchEnds[i];
|
||||
pos = passage.getStartOffset();
|
||||
for (int i = 0; i < passage.getNumMatches(); i++) {
|
||||
int start = passage.getMatchStarts()[i];
|
||||
int end = passage.getMatchEnds()[i];
|
||||
// its possible to have overlapping terms
|
||||
if (start > pos) {
|
||||
sb.append(content, pos, start);
|
||||
|
@ -719,8 +719,8 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
// its possible a "term" from the analyzer could span a sentence boundary.
|
||||
sb.append(content, pos, Math.max(pos, passage.endOffset));
|
||||
pos = passage.endOffset;
|
||||
sb.append(content, pos, Math.max(pos, passage.getEndOffset()));
|
||||
pos = passage.getEndOffset();
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue