LUCENE-7559: UnifiedHighlighter: Increase Passage visibility

David Smiley 2016-11-17 11:19:54 -05:00
parent 738198ef34
commit c51e89014a
8 changed files with 70 additions and 47 deletions

lucene/CHANGES.txt

@@ -111,6 +111,9 @@ Other
 * LUCENE-7534: fix smokeTestRelease.py to run on Cygwin (Mikhail Khludnev)
+* LUCENE-7559: UnifiedHighlighter: Make Passage more exposed to allow passage creation to
+  be customized. (David Smiley)
 Build
 * LUCENE-7387: fix defaultCodec in build.xml to account for the line ending (hossman)
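
The entry above is the API story in miniature: Passage's offsets, score, and match data
become reachable through public accessors instead of package-private fields. As a quick
orientation before the per-file diffs, a minimal sketch of what callers can now do (the
offsets and term are invented for illustration; the package name assumes the uhighlight
module):

    import org.apache.lucene.search.uhighlight.Passage;
    import org.apache.lucene.util.BytesRef;

    // Assemble a passage by hand; the setters are new in this commit (see Passage.java below).
    Passage passage = new Passage();
    passage.setStartOffset(0);                     // snippet start within the field content
    passage.setEndOffset(26);                      // exclusive snippet end
    passage.setScore(1.0f);                        // relevance weight used to rank passages
    passage.addMatch(4, 9, new BytesRef("quick")); // highlight [4, 9) as a hit for "quick"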

lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java

@@ -75,6 +75,9 @@ public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy {
 *
 * @lucene.internal
 */
+// TODO we could make this go away. MemoryIndexOffsetStrategy could simply split and analyze each value into the
+// MemoryIndex. TokenStreamOffsetStrategy's hack TokenStreamPostingsEnum could incorporate this logic,
+// albeit with less code, less hack.
 private static final class MultiValueTokenStream extends TokenFilter {
 private final String fieldName;

lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java

@@ -63,13 +63,13 @@ public class DefaultPassageFormatter extends PassageFormatter {
 int pos = 0;
 for (Passage passage : passages) {
 // don't add ellipsis if its the first one, or if its connected.
-if (passage.startOffset > pos && pos > 0) {
+if (passage.getStartOffset() > pos && pos > 0) {
 sb.append(ellipsis);
 }
-pos = passage.startOffset;
-for (int i = 0; i < passage.numMatches; i++) {
-int start = passage.matchStarts[i];
-int end = passage.matchEnds[i];
+pos = passage.getStartOffset();
+for (int i = 0; i < passage.getNumMatches(); i++) {
+int start = passage.getMatchStarts()[i];
+int end = passage.getMatchEnds()[i];
 // its possible to have overlapping terms
 if (start > pos) {
 append(sb, content, pos, start);
@@ -82,8 +82,8 @@ public class DefaultPassageFormatter extends PassageFormatter {
 }
 }
 // its possible a "term" from the analyzer could span a sentence boundary.
-append(sb, content, pos, Math.max(pos, passage.endOffset));
-pos = passage.endOffset;
+append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
+pos = passage.getEndOffset();
 }
 return sb.toString();
 }
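
Note that simple markup changes don't require reaching into Passage at all;
DefaultPassageFormatter's constructor already takes the tags and ellipsis as parameters.
A usage sketch (tag strings chosen arbitrarily):

    // preTag, postTag, ellipsis, and whether to HTML-escape the content.
    PassageFormatter formatter = new DefaultPassageFormatter("<em>", "</em>", "... ", true);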

lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java

@@ -117,9 +117,9 @@ public class FieldHighlighter {
 break;
 }
 Passage passage = new Passage();
-passage.score = Float.NaN;
-passage.startOffset = pos;
-passage.endOffset = next;
+passage.setScore(Float.NaN);
+passage.setStartOffset(pos);
+passage.setEndOffset(next);
 passages.add(passage);
 pos = next;
 }
@@ -145,12 +145,12 @@ public class FieldHighlighter {
 offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY)); // a sentinel for termination
 PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
-if (left.score < right.score) {
+if (left.getScore() < right.getScore()) {
 return -1;
-} else if (left.score > right.score) {
+} else if (left.getScore() > right.getScore()) {
 return 1;
 } else {
-return left.startOffset - right.startOffset;
+return left.getStartOffset() - right.getStartOffset();
 }
 });
 Passage passage = new Passage(); // the current passage in-progress. Will either get reset or added to queue.
@@ -170,12 +170,12 @@ public class FieldHighlighter {
 continue;
 }
 // See if this term should be part of a new passage.
-if (start >= passage.endOffset) {
-if (passage.startOffset >= 0) { // true if this passage has terms; otherwise couldn't find any (yet)
+if (start >= passage.getEndOffset()) {
+if (passage.getStartOffset() >= 0) { // true if this passage has terms; otherwise couldn't find any (yet)
 // finalize passage
-passage.score *= scorer.norm(passage.startOffset);
+passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset()));
 // new sentence: first add 'passage' to queue
-if (passageQueue.size() == maxPassages && passage.score < passageQueue.peek().score) {
+if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) {
 passage.reset(); // can't compete, just reset it
 } else {
 passageQueue.offer(passage);
@@ -192,8 +192,8 @@ public class FieldHighlighter {
 break;
 }
 // advance breakIterator
-passage.startOffset = Math.max(breakIterator.preceding(start + 1), 0);
-passage.endOffset = Math.min(breakIterator.following(start), contentLength);
+passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0));
+passage.setEndOffset(Math.min(breakIterator.following(start), contentLength));
 }
 // Add this term to the passage.
 int tf = 0;
@@ -209,12 +209,12 @@ public class FieldHighlighter {
 off.nextPosition();
 start = off.startOffset();
 end = off.endOffset();
-if (start >= passage.endOffset || end > contentLength) { // it's beyond this passage
+if (start >= passage.getEndOffset() || end > contentLength) { // it's beyond this passage
 offsetsEnumQueue.offer(off);
 break;
 }
 }
-passage.score += off.weight * scorer.tf(tf, passage.endOffset - passage.startOffset);
+passage.setScore(passage.getScore() + off.weight * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset()));
 }
 Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
@@ -222,7 +222,7 @@ public class FieldHighlighter {
 p.sort();
 }
 // sort in ascending order
-Arrays.sort(passages, (left, right) -> left.startOffset - right.startOffset);
+Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset());
 return passages;
 }
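
The swaps above are mechanical, but they sit in the core passage-construction loop: a
BreakIterator snaps each match to its enclosing sentence, and a size-bounded min-heap
keeps the best maxPassages candidates, evicting from the head where the weakest passage
sits. A self-contained sketch of that selection pattern using only the JDK (names and
sample data are illustrative, not the commit's code):

    import java.text.BreakIterator;
    import java.util.Locale;
    import java.util.PriorityQueue;

    class PassageSelectionSketch {
      public static void main(String[] args) {
        String content = "The quick fox. It jumped over the dog. The dog slept.";

        // Snap a match offset to the enclosing sentence, clamped to the content bounds.
        BreakIterator bi = BreakIterator.getSentenceInstance(Locale.ROOT);
        bi.setText(content);
        int matchStart = content.indexOf("jumped");
        int passageStart = Math.max(bi.preceding(matchStart + 1), 0);
        int passageEnd = Math.min(bi.following(matchStart), content.length());
        System.out.println(content.substring(passageStart, passageEnd)); // middle sentence

        // Bounded min-heap: the weakest candidate sits at the head, so once the queue
        // is full each new candidate either displaces the head or is discarded.
        int maxPassages = 2;
        PriorityQueue<float[]> queue = new PriorityQueue<>(maxPassages + 1,
            (left, right) -> Float.compare(left[0], right[0])); // float[]{score, startOffset}
        for (float[] candidate : new float[][] {{0.5f, 0}, {1.5f, 15}, {1.0f, 39}}) {
          queue.offer(candidate);
          if (queue.size() > maxPassages) {
            queue.poll(); // drop the lowest-scoring passage
          }
        }
        System.out.println(queue.size() + " passages kept"); // 2
      }
    }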

lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java

@@ -66,9 +66,8 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
 }
 BytesRef getTerm() throws IOException {
-// the dp.getPayload thing is a hack -- see MultiTermHighlighting
-return term != null ? term : postingsEnum.getPayload();
-// We don't deepcopy() because in this hack we know we don't have to.
+// TODO TokenStreamOffsetStrategy could override OffsetsEnum; then remove this hack here
+return term != null ? term : postingsEnum.getPayload(); // abusing payload like this is a total hack!
 }
 boolean hasMorePositions() throws IOException {
@@ -91,7 +90,8 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
 @Override
 public void close() throws IOException {
-if (postingsEnum instanceof Closeable) { // the one in MultiTermHighlighting is.
+// TODO TokenStreamOffsetStrategy could override OffsetsEnum; then this base impl would be no-op.
+if (postingsEnum instanceof Closeable) {
 ((Closeable) postingsEnum).close();
 }
 }

lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java

@@ -30,16 +30,17 @@ import org.apache.lucene.util.RamUsageEstimator;
 *
 * @lucene.experimental
 */
-public final class Passage {
-int startOffset = -1;
-int endOffset = -1;
-float score = 0.0f;
+public class Passage {
+private int startOffset = -1;
+private int endOffset = -1;
+private float score = 0.0f;
-int matchStarts[] = new int[8];
-int matchEnds[] = new int[8];
-BytesRef matchTerms[] = new BytesRef[8];
-int numMatches = 0;
+private int[] matchStarts = new int[8];
+private int[] matchEnds = new int[8];
+private BytesRef[] matchTerms = new BytesRef[8];
+private int numMatches = 0;
+/** @lucene.internal */
 public void addMatch(int startOffset, int endOffset, BytesRef term) {
 assert startOffset >= this.startOffset && startOffset <= this.endOffset;
 if (numMatches == matchStarts.length) {
@@ -61,7 +62,8 @@ public final class Passage {
 numMatches++;
 }
-void sort() {
+/** @lucene.internal */
+public void sort() {
 final int starts[] = matchStarts;
 final int ends[] = matchEnds;
 final BytesRef terms[] = matchTerms;
@@ -89,7 +91,8 @@ public final class Passage {
 }.sort(0, numMatches);
 }
-void reset() {
+/** @lucene.internal */
+public void reset() {
 startOffset = endOffset = -1;
 score = 0.0f;
 numMatches = 0;
@@ -158,4 +161,19 @@ public final class Passage {
 public BytesRef[] getMatchTerms() {
 return matchTerms;
 }
+/** @lucene.internal */
+public void setStartOffset(int startOffset) {
+this.startOffset = startOffset;
+}
+/** @lucene.internal */
+public void setEndOffset(int endOffset) {
+this.endOffset = endOffset;
+}
+/** @lucene.internal */
+public void setScore(float score) {
+this.score = score;
+}
 }
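
With final gone and the state behind private fields plus public accessors, Passage can
now be subclassed, which is the "increase visibility" in this issue's title. A
hypothetical extension (the label field is invented for illustration; the package name
assumes the uhighlight module):

    import org.apache.lucene.search.uhighlight.Passage;

    // Carry one extra application-specific attribute alongside the standard passage state.
    public class LabeledPassage extends Passage {
      private String label;

      public String getLabel() { return label; }
      public void setLabel(String label) { this.label = label; }
    }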

lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java

@@ -69,10 +69,8 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
 return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
 }
 // but this would have a performance cost for likely little gain in the user experience, it
 // would only serve to make this method less bogus.
 // instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
-// TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
-// See class javadocs.
+// TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl? See TODOs in OffsetsEnum.
 private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
 TokenStream stream; // becomes null when closed
 final CharacterRunAutomaton[] matchers;
@@ -134,6 +132,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
 return currentEndOffset;
 }
+// TOTAL HACK; used in OffsetsEnum.getTerm()
 @Override
 public BytesRef getPayload() throws IOException {
 if (matchDescriptions[currentMatch] == null) {

lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java

@@ -697,13 +697,13 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
 int pos = 0;
 for (Passage passage : passages) {
 // don't add ellipsis if its the first one, or if its connected.
-if (passage.startOffset > pos && pos > 0) {
+if (passage.getStartOffset() > pos && pos > 0) {
 sb.append("... ");
 }
-pos = passage.startOffset;
-for (int i = 0; i < passage.numMatches; i++) {
-int start = passage.matchStarts[i];
-int end = passage.matchEnds[i];
+pos = passage.getStartOffset();
+for (int i = 0; i < passage.getNumMatches(); i++) {
+int start = passage.getMatchStarts()[i];
+int end = passage.getMatchEnds()[i];
 // its possible to have overlapping terms
 if (start > pos) {
 sb.append(content, pos, start);
@@ -719,8 +719,8 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
 }
 }
 // its possible a "term" from the analyzer could span a sentence boundary.
-sb.append(content, pos, Math.max(pos, passage.endOffset));
-pos = passage.endOffset;
+sb.append(content, pos, Math.max(pos, passage.getEndOffset()));
+pos = passage.getEndOffset();
 }
 return sb.toString();
 }
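
This test formatter mirrors DefaultPassageFormatter but emits its own markup through the
now-public getters. Wiring such a formatter into a highlighter would look roughly like
this, assuming UnifiedHighlighter's overridable getFormatter hook and an already-built
searcher, analyzer, query, and topDocs (all assumptions, not part of this commit):

    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) {
      @Override
      protected PassageFormatter getFormatter(String field) {
        return myFormatter; // e.g. the custom formatter defined in this test
      }
    };
    String[] snippets = highlighter.highlight("body", query, topDocs);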