mirror of https://github.com/apache/lucene.git

LUCENE-7559: UnifiedHighlighter: Increase Passage visibility

commit c51e89014a (parent 738198ef34)
@@ -111,6 +111,9 @@ Other
 
 * LUCENE-7534: fix smokeTestRelease.py to run on Cygwin (Mikhail Khludnev)
 
+* LUCENE-7559: UnifiedHighlighter: Make Passage more exposed to allow passage creation to
+  be customized. (David Smiley)
+
 Build
 
 * LUCENE-7387: fix defaultCodec in build.xml to account for the line ending (hossman)
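This is what the change enables: with Passage's setters now public (see the Passage.java hunks below), code outside the org.apache.lucene.search.uhighlight package can create and populate passages itself. A minimal sketch, assuming a hypothetical caller class; it relies only on the Passage methods visible in this diff (setStartOffset, setEndOffset, setScore, addMatch, and the existing getters).

import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.util.BytesRef;

public class PassageCreationSketch {
  // Build a passage covering content[0..40) with one highlighted term at [10..15).
  static Passage makePassage() {
    Passage passage = new Passage();
    passage.setStartOffset(0);
    passage.setEndOffset(40);
    passage.addMatch(10, 15, new BytesRef("term")); // addMatch asserts the match lies inside the passage
    passage.setScore(1.0f);
    return passage;
  }

  public static void main(String[] args) {
    Passage p = makePassage();
    System.out.println(p.getStartOffset() + ".." + p.getEndOffset() + " matches=" + p.getNumMatches());
  }
}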
@@ -75,6 +75,9 @@ public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy {
    *
    * @lucene.internal
    */
+  // TODO we could make this go away. MemoryIndexOffsetStrategy could simply split and analyze each value into the
+  // MemoryIndex. TokenStreamOffsetStrategy's hack TokenStreamPostingsEnum could incorporate this logic,
+  // albeit with less code, less hack.
   private static final class MultiValueTokenStream extends TokenFilter {
 
     private final String fieldName;
@@ -63,13 +63,13 @@ public class DefaultPassageFormatter extends PassageFormatter {
     int pos = 0;
     for (Passage passage : passages) {
       // don't add ellipsis if its the first one, or if its connected.
-      if (passage.startOffset > pos && pos > 0) {
+      if (passage.getStartOffset() > pos && pos > 0) {
         sb.append(ellipsis);
       }
-      pos = passage.startOffset;
-      for (int i = 0; i < passage.numMatches; i++) {
-        int start = passage.matchStarts[i];
-        int end = passage.matchEnds[i];
+      pos = passage.getStartOffset();
+      for (int i = 0; i < passage.getNumMatches(); i++) {
+        int start = passage.getMatchStarts()[i];
+        int end = passage.getMatchEnds()[i];
         // its possible to have overlapping terms
         if (start > pos) {
           append(sb, content, pos, start);
@@ -82,8 +82,8 @@ public class DefaultPassageFormatter extends PassageFormatter {
         }
       }
       // its possible a "term" from the analyzer could span a sentence boundary.
-      append(sb, content, pos, Math.max(pos, passage.endOffset));
-      pos = passage.endOffset;
+      append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
+      pos = passage.getEndOffset();
     }
     return sb.toString();
   }
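The getter-based rewrite above is what allows a passage formatter to live outside this package. A hedged sketch of such a formatter, modeled on DefaultPassageFormatter and assuming PassageFormatter exposes format(Passage[], String) as used there; the class name and bracket markup are made up for illustration.

import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageFormatter;

// Wraps each matched term in square brackets, using only Passage's public getters.
public class BracketPassageFormatter extends PassageFormatter {
  @Override
  public String format(Passage[] passages, String content) {
    StringBuilder sb = new StringBuilder();
    for (Passage passage : passages) {
      if (sb.length() > 0) {
        sb.append("... "); // separator between passages
      }
      int pos = passage.getStartOffset();
      for (int i = 0; i < passage.getNumMatches(); i++) {
        int start = passage.getMatchStarts()[i];
        int end = passage.getMatchEnds()[i];
        if (start > pos) {
          sb.append(content, pos, start); // plain text before this match
        }
        if (end > pos) { // guard against overlapping matches
          sb.append('[').append(content, Math.max(pos, start), end).append(']');
          pos = end;
        }
      }
      sb.append(content, pos, Math.max(pos, passage.getEndOffset())); // tail of the passage
    }
    return sb.toString();
  }
}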
@@ -117,9 +117,9 @@ public class FieldHighlighter {
         break;
       }
       Passage passage = new Passage();
-      passage.score = Float.NaN;
-      passage.startOffset = pos;
-      passage.endOffset = next;
+      passage.setScore(Float.NaN);
+      passage.setStartOffset(pos);
+      passage.setEndOffset(next);
       passages.add(passage);
       pos = next;
     }
@@ -145,12 +145,12 @@ public class FieldHighlighter {
     offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY)); // a sentinel for termination
 
     PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
-      if (left.score < right.score) {
+      if (left.getScore() < right.getScore()) {
         return -1;
-      } else if (left.score > right.score) {
+      } else if (left.getScore() > right.getScore()) {
         return 1;
       } else {
-        return left.startOffset - right.startOffset;
+        return left.getStartOffset() - right.getStartOffset();
       }
     });
     Passage passage = new Passage(); // the current passage in-progress. Will either get reset or added to queue.
@@ -170,12 +170,12 @@ public class FieldHighlighter {
         continue;
       }
       // See if this term should be part of a new passage.
-      if (start >= passage.endOffset) {
-        if (passage.startOffset >= 0) { // true if this passage has terms; otherwise couldn't find any (yet)
+      if (start >= passage.getEndOffset()) {
+        if (passage.getStartOffset() >= 0) { // true if this passage has terms; otherwise couldn't find any (yet)
           // finalize passage
-          passage.score *= scorer.norm(passage.startOffset);
+          passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset()));
           // new sentence: first add 'passage' to queue
-          if (passageQueue.size() == maxPassages && passage.score < passageQueue.peek().score) {
+          if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) {
             passage.reset(); // can't compete, just reset it
           } else {
             passageQueue.offer(passage);
@@ -192,8 +192,8 @@ public class FieldHighlighter {
           break;
         }
         // advance breakIterator
-        passage.startOffset = Math.max(breakIterator.preceding(start + 1), 0);
-        passage.endOffset = Math.min(breakIterator.following(start), contentLength);
+        passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0));
+        passage.setEndOffset(Math.min(breakIterator.following(start), contentLength));
       }
       // Add this term to the passage.
       int tf = 0;
@@ -209,12 +209,12 @@ public class FieldHighlighter {
         off.nextPosition();
         start = off.startOffset();
         end = off.endOffset();
-        if (start >= passage.endOffset || end > contentLength) { // it's beyond this passage
+        if (start >= passage.getEndOffset() || end > contentLength) { // it's beyond this passage
           offsetsEnumQueue.offer(off);
           break;
         }
       }
-      passage.score += off.weight * scorer.tf(tf, passage.endOffset - passage.startOffset);
+      passage.setScore(passage.getScore() + off.weight * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset()));
     }
 
     Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
@@ -222,7 +222,7 @@ public class FieldHighlighter {
       p.sort();
     }
     // sort in ascending order
-    Arrays.sort(passages, (left, right) -> left.startOffset - right.startOffset);
+    Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset());
     return passages;
   }
 
@@ -66,9 +66,8 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
   }
 
   BytesRef getTerm() throws IOException {
-    // the dp.getPayload thing is a hack -- see MultiTermHighlighting
-    return term != null ? term : postingsEnum.getPayload();
-    // We don't deepcopy() because in this hack we know we don't have to.
+    // TODO TokenStreamOffsetStrategy could override OffsetsEnum; then remove this hack here
+    return term != null ? term : postingsEnum.getPayload(); // abusing payload like this is a total hack!
   }
 
   boolean hasMorePositions() throws IOException {
@@ -91,7 +90,8 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
 
   @Override
   public void close() throws IOException {
-    if (postingsEnum instanceof Closeable) { // the one in MultiTermHighlighting is.
+    // TODO TokenStreamOffsetStrategy could override OffsetsEnum; then this base impl would be no-op.
+    if (postingsEnum instanceof Closeable) {
       ((Closeable) postingsEnum).close();
     }
   }
@@ -30,16 +30,17 @@ import org.apache.lucene.util.RamUsageEstimator;
  *
  * @lucene.experimental
  */
-public final class Passage {
-  int startOffset = -1;
-  int endOffset = -1;
-  float score = 0.0f;
+public class Passage {
+  private int startOffset = -1;
+  private int endOffset = -1;
+  private float score = 0.0f;
 
-  int matchStarts[] = new int[8];
-  int matchEnds[] = new int[8];
-  BytesRef matchTerms[] = new BytesRef[8];
-  int numMatches = 0;
+  private int[] matchStarts = new int[8];
+  private int[] matchEnds = new int[8];
+  private BytesRef[] matchTerms = new BytesRef[8];
+  private int numMatches = 0;
 
+  /** @lucene.internal */
   public void addMatch(int startOffset, int endOffset, BytesRef term) {
     assert startOffset >= this.startOffset && startOffset <= this.endOffset;
     if (numMatches == matchStarts.length) {
@@ -61,7 +62,8 @@ public final class Passage {
     numMatches++;
   }
 
-  void sort() {
+  /** @lucene.internal */
+  public void sort() {
     final int starts[] = matchStarts;
     final int ends[] = matchEnds;
     final BytesRef terms[] = matchTerms;
@@ -89,7 +91,8 @@ public final class Passage {
     }.sort(0, numMatches);
   }
 
-  void reset() {
+  /** @lucene.internal */
+  public void reset() {
     startOffset = endOffset = -1;
     score = 0.0f;
     numMatches = 0;
@@ -158,4 +161,19 @@ public final class Passage {
   public BytesRef[] getMatchTerms() {
     return matchTerms;
   }
+
+  /** @lucene.internal */
+  public void setStartOffset(int startOffset) {
+    this.startOffset = startOffset;
+  }
+
+  /** @lucene.internal */
+  public void setEndOffset(int endOffset) {
+    this.endOffset = endOffset;
+  }
+
+  /** @lucene.internal */
+  public void setScore(float score) {
+    this.score = score;
+  }
 }
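With Passage no longer final (see the first Passage.java hunk above), it can also be subclassed when a custom highlighter needs to attach extra per-passage state. A purely hypothetical sketch; the subclass and its field are not part of this commit.

import org.apache.lucene.search.uhighlight.Passage;

// Hypothetical subclass: a passage that remembers which field it was built from.
public class FieldAwarePassage extends Passage {
  private String field;

  public void setField(String field) {
    this.field = field;
  }

  public String getField() {
    return field;
  }
}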
@@ -69,10 +69,8 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
     return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
   }
 
-  // but this would have a performance cost for likely little gain in the user experience, it
-  // would only serve to make this method less bogus.
-  // instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
-  // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
+  // See class javadocs.
+  // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl? See TODOs in OffsetsEnum.
   private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
     TokenStream stream; // becomes null when closed
     final CharacterRunAutomaton[] matchers;
@@ -134,6 +132,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
       return currentEndOffset;
     }
 
+    // TOTAL HACK; used in OffsetsEnum.getTerm()
     @Override
     public BytesRef getPayload() throws IOException {
       if (matchDescriptions[currentMatch] == null) {
@@ -697,13 +697,13 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
         int pos = 0;
         for (Passage passage : passages) {
           // don't add ellipsis if its the first one, or if its connected.
-          if (passage.startOffset > pos && pos > 0) {
+          if (passage.getStartOffset() > pos && pos > 0) {
             sb.append("... ");
           }
-          pos = passage.startOffset;
-          for (int i = 0; i < passage.numMatches; i++) {
-            int start = passage.matchStarts[i];
-            int end = passage.matchEnds[i];
+          pos = passage.getStartOffset();
+          for (int i = 0; i < passage.getNumMatches(); i++) {
+            int start = passage.getMatchStarts()[i];
+            int end = passage.getMatchEnds()[i];
             // its possible to have overlapping terms
             if (start > pos) {
               sb.append(content, pos, start);
@@ -719,8 +719,8 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
            }
          }
          // its possible a "term" from the analyzer could span a sentence boundary.
-          sb.append(content, pos, Math.max(pos, passage.endOffset));
-          pos = passage.endOffset;
+          sb.append(content, pos, Math.max(pos, passage.getEndOffset()));
+          pos = passage.getEndOffset();
         }
         return sb.toString();
       }