LUCENE-7559: UnifiedHighlighter: Increase Passage visibility

This commit is contained in:
David Smiley 2016-11-17 11:19:54 -05:00
parent 738198ef34
commit c51e89014a
8 changed files with 70 additions and 47 deletions

View File

@ -111,6 +111,9 @@ Other
* LUCENE-7534: fix smokeTestRelease.py to run on Cygwin (Mikhail Khludnev) * LUCENE-7534: fix smokeTestRelease.py to run on Cygwin (Mikhail Khludnev)
* LUCENE-7559: UnifiedHighlighter: Make Passage more visible (non-final class; public setters,
sort() and reset()) so that passage creation can be customized. (David Smiley)
Build Build
* LUCENE-7387: fix defaultCodec in build.xml to account for the line ending (hossman) * LUCENE-7387: fix defaultCodec in build.xml to account for the line ending (hossman)

View File

@ -75,6 +75,9 @@ public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy {
* *
* @lucene.internal * @lucene.internal
*/ */
// TODO we could make this go away. MemoryIndexOffsetStrategy could simply split and analyze each value into the
// MemoryIndex. TokenStreamOffsetStrategy's hack TokenStreamPostingsEnum could incorporate this logic,
// albeit with less code, less hack.
private static final class MultiValueTokenStream extends TokenFilter { private static final class MultiValueTokenStream extends TokenFilter {
private final String fieldName; private final String fieldName;

View File

@ -63,13 +63,13 @@ public class DefaultPassageFormatter extends PassageFormatter {
int pos = 0; int pos = 0;
for (Passage passage : passages) { for (Passage passage : passages) {
// don't add ellipsis if its the first one, or if its connected. // don't add ellipsis if its the first one, or if its connected.
if (passage.startOffset > pos && pos > 0) { if (passage.getStartOffset() > pos && pos > 0) {
sb.append(ellipsis); sb.append(ellipsis);
} }
pos = passage.startOffset; pos = passage.getStartOffset();
for (int i = 0; i < passage.numMatches; i++) { for (int i = 0; i < passage.getNumMatches(); i++) {
int start = passage.matchStarts[i]; int start = passage.getMatchStarts()[i];
int end = passage.matchEnds[i]; int end = passage.getMatchEnds()[i];
// its possible to have overlapping terms // its possible to have overlapping terms
if (start > pos) { if (start > pos) {
append(sb, content, pos, start); append(sb, content, pos, start);
@ -82,8 +82,8 @@ public class DefaultPassageFormatter extends PassageFormatter {
} }
} }
// its possible a "term" from the analyzer could span a sentence boundary. // its possible a "term" from the analyzer could span a sentence boundary.
append(sb, content, pos, Math.max(pos, passage.endOffset)); append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
pos = passage.endOffset; pos = passage.getEndOffset();
} }
return sb.toString(); return sb.toString();
} }

View File

@ -117,9 +117,9 @@ public class FieldHighlighter {
break; break;
} }
Passage passage = new Passage(); Passage passage = new Passage();
passage.score = Float.NaN; passage.setScore(Float.NaN);
passage.startOffset = pos; passage.setStartOffset(pos);
passage.endOffset = next; passage.setEndOffset(next);
passages.add(passage); passages.add(passage);
pos = next; pos = next;
} }
@ -145,12 +145,12 @@ public class FieldHighlighter {
offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY)); // a sentinel for termination offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY)); // a sentinel for termination
PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> { PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
if (left.score < right.score) { if (left.getScore() < right.getScore()) {
return -1; return -1;
} else if (left.score > right.score) { } else if (left.getScore() > right.getScore()) {
return 1; return 1;
} else { } else {
return left.startOffset - right.startOffset; return left.getStartOffset() - right.getStartOffset();
} }
}); });
Passage passage = new Passage(); // the current passage in-progress. Will either get reset or added to queue. Passage passage = new Passage(); // the current passage in-progress. Will either get reset or added to queue.
@ -170,12 +170,12 @@ public class FieldHighlighter {
continue; continue;
} }
// See if this term should be part of a new passage. // See if this term should be part of a new passage.
if (start >= passage.endOffset) { if (start >= passage.getEndOffset()) {
if (passage.startOffset >= 0) { // true if this passage has terms; otherwise couldn't find any (yet) if (passage.getStartOffset() >= 0) { // true if this passage has terms; otherwise couldn't find any (yet)
// finalize passage // finalize passage
passage.score *= scorer.norm(passage.startOffset); passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset()));
// new sentence: first add 'passage' to queue // new sentence: first add 'passage' to queue
if (passageQueue.size() == maxPassages && passage.score < passageQueue.peek().score) { if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) {
passage.reset(); // can't compete, just reset it passage.reset(); // can't compete, just reset it
} else { } else {
passageQueue.offer(passage); passageQueue.offer(passage);
@ -192,8 +192,8 @@ public class FieldHighlighter {
break; break;
} }
// advance breakIterator // advance breakIterator
passage.startOffset = Math.max(breakIterator.preceding(start + 1), 0); passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0));
passage.endOffset = Math.min(breakIterator.following(start), contentLength); passage.setEndOffset(Math.min(breakIterator.following(start), contentLength));
} }
// Add this term to the passage. // Add this term to the passage.
int tf = 0; int tf = 0;
@ -209,12 +209,12 @@ public class FieldHighlighter {
off.nextPosition(); off.nextPosition();
start = off.startOffset(); start = off.startOffset();
end = off.endOffset(); end = off.endOffset();
if (start >= passage.endOffset || end > contentLength) { // it's beyond this passage if (start >= passage.getEndOffset() || end > contentLength) { // it's beyond this passage
offsetsEnumQueue.offer(off); offsetsEnumQueue.offer(off);
break; break;
} }
} }
passage.score += off.weight * scorer.tf(tf, passage.endOffset - passage.startOffset); passage.setScore(passage.getScore() + off.weight * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset()));
} }
Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]); Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
@ -222,7 +222,7 @@ public class FieldHighlighter {
p.sort(); p.sort();
} }
// sort in ascending order // sort in ascending order
Arrays.sort(passages, (left, right) -> left.startOffset - right.startOffset); Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset());
return passages; return passages;
} }

View File

@ -66,9 +66,8 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
} }
BytesRef getTerm() throws IOException { BytesRef getTerm() throws IOException {
// the dp.getPayload thing is a hack -- see MultiTermHighlighting // TODO TokenStreamOffsetStrategy could override OffsetsEnum; then remove this hack here
return term != null ? term : postingsEnum.getPayload(); return term != null ? term : postingsEnum.getPayload(); // abusing payload like this is a total hack!
// We don't deepcopy() because in this hack we know we don't have to.
} }
boolean hasMorePositions() throws IOException { boolean hasMorePositions() throws IOException {
@ -91,7 +90,8 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
@Override @Override
public void close() throws IOException { public void close() throws IOException {
if (postingsEnum instanceof Closeable) { // the one in MultiTermHighlighting is. // TODO TokenStreamOffsetStrategy could override OffsetsEnum; then this base impl would be no-op.
if (postingsEnum instanceof Closeable) {
((Closeable) postingsEnum).close(); ((Closeable) postingsEnum).close();
} }
} }

View File

@ -30,16 +30,17 @@ import org.apache.lucene.util.RamUsageEstimator;
* *
* @lucene.experimental * @lucene.experimental
*/ */
public final class Passage { public class Passage {
int startOffset = -1; private int startOffset = -1;
int endOffset = -1; private int endOffset = -1;
float score = 0.0f; private float score = 0.0f;
int matchStarts[] = new int[8]; private int[] matchStarts = new int[8];
int matchEnds[] = new int[8]; private int[] matchEnds = new int[8];
BytesRef matchTerms[] = new BytesRef[8]; private BytesRef[] matchTerms = new BytesRef[8];
int numMatches = 0; private int numMatches = 0;
/** @lucene.internal */
public void addMatch(int startOffset, int endOffset, BytesRef term) { public void addMatch(int startOffset, int endOffset, BytesRef term) {
assert startOffset >= this.startOffset && startOffset <= this.endOffset; assert startOffset >= this.startOffset && startOffset <= this.endOffset;
if (numMatches == matchStarts.length) { if (numMatches == matchStarts.length) {
@ -61,7 +62,8 @@ public final class Passage {
numMatches++; numMatches++;
} }
void sort() { /** @lucene.internal */
public void sort() {
final int starts[] = matchStarts; final int starts[] = matchStarts;
final int ends[] = matchEnds; final int ends[] = matchEnds;
final BytesRef terms[] = matchTerms; final BytesRef terms[] = matchTerms;
@ -89,7 +91,8 @@ public final class Passage {
}.sort(0, numMatches); }.sort(0, numMatches);
} }
void reset() { /** @lucene.internal */
public void reset() {
startOffset = endOffset = -1; startOffset = endOffset = -1;
score = 0.0f; score = 0.0f;
numMatches = 0; numMatches = 0;
@ -158,4 +161,19 @@ public final class Passage {
public BytesRef[] getMatchTerms() { public BytesRef[] getMatchTerms() {
return matchTerms; return matchTerms;
} }
/**
 * Sets the start offset of this passage.
 *
 * @param startOffset the value to store in this passage's {@code startOffset} field
 * @lucene.internal
 */
public void setStartOffset(int startOffset) {
this.startOffset = startOffset;
}
/**
 * Sets the end offset of this passage.
 *
 * @param endOffset the value to store in this passage's {@code endOffset} field
 * @lucene.internal
 */
public void setEndOffset(int endOffset) {
this.endOffset = endOffset;
}
/**
 * Sets the score of this passage, replacing any previously stored value.
 *
 * @param score the value to store in this passage's {@code score} field
 * @lucene.internal
 */
public void setScore(float score) {
this.score = score;
}
} }

View File

@ -69,10 +69,8 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum)); return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
} }
// but this would have a performance cost for likely little gain in the user experience, it // See class javadocs.
// would only serve to make this method less bogus. // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl? See TODOs in OffsetsEnum.
// instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
// TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable { private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
TokenStream stream; // becomes null when closed TokenStream stream; // becomes null when closed
final CharacterRunAutomaton[] matchers; final CharacterRunAutomaton[] matchers;
@ -134,6 +132,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
return currentEndOffset; return currentEndOffset;
} }
// TOTAL HACK; used in OffsetsEnum.getTerm()
@Override @Override
public BytesRef getPayload() throws IOException { public BytesRef getPayload() throws IOException {
if (matchDescriptions[currentMatch] == null) { if (matchDescriptions[currentMatch] == null) {

View File

@ -697,13 +697,13 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
int pos = 0; int pos = 0;
for (Passage passage : passages) { for (Passage passage : passages) {
// don't add ellipsis if its the first one, or if its connected. // don't add ellipsis if its the first one, or if its connected.
if (passage.startOffset > pos && pos > 0) { if (passage.getStartOffset() > pos && pos > 0) {
sb.append("... "); sb.append("... ");
} }
pos = passage.startOffset; pos = passage.getStartOffset();
for (int i = 0; i < passage.numMatches; i++) { for (int i = 0; i < passage.getNumMatches(); i++) {
int start = passage.matchStarts[i]; int start = passage.getMatchStarts()[i];
int end = passage.matchEnds[i]; int end = passage.getMatchEnds()[i];
// its possible to have overlapping terms // its possible to have overlapping terms
if (start > pos) { if (start > pos) {
sb.append(content, pos, start); sb.append(content, pos, start);
@ -719,8 +719,8 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
} }
} }
// its possible a "term" from the analyzer could span a sentence boundary. // its possible a "term" from the analyzer could span a sentence boundary.
sb.append(content, pos, Math.max(pos, passage.endOffset)); sb.append(content, pos, Math.max(pos, passage.getEndOffset()));
pos = passage.endOffset; pos = passage.getEndOffset();
} }
return sb.toString(); return sb.toString();
} }