LUCENE-6334: fix FastVectorHighlighter when a phrase spans more than one value in a multi-valued field

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1693156 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2015-07-28 20:24:01 +00:00
parent 27aa993b15
commit 7023d92ca8
3 changed files with 101 additions and 2 deletions

View File

@ -281,6 +281,10 @@ Bug fixes
* LUCENE-6696: Fix FilterDirectoryReader.close() to never close the
underlying reader several times. (Adrien Grand)
* LUCENE-6334: FastVectorHighlighter failed to highlight phrases across
more than one value in a multi-valued field. (Chris Earle, Nik Everett
via Mike McCandless)
Changes in Runtime Behavior
* LUCENE-6501: The subreader structure in ParallelCompositeReader

View File

@ -268,10 +268,39 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
Iterator<Toffs> toffsIterator = subInfo.getTermsOffsets().iterator();
while (toffsIterator.hasNext()) {
Toffs toffs = toffsIterator.next();
if (toffs.getStartOffset() >= fieldStart && toffs.getEndOffset() <= fieldEnd) {
if (toffs.getStartOffset() >= fieldEnd) {
// We've gone past this value so its not worth iterating any more.
break;
}
boolean startsAfterField = toffs.getStartOffset() >= fieldStart;
boolean endsBeforeField = toffs.getEndOffset() < fieldEnd;
if (startsAfterField && endsBeforeField) {
// The Toff is entirely within this value.
toffsList.add(toffs);
toffsIterator.remove();
} else if (startsAfterField) {
/*
* The Toffs starts within this value but ends after this value
* so we clamp the returned Toffs to this value and leave the
* Toffs in the iterator for the next value of this field.
*/
toffsList.add(new Toffs(toffs.getStartOffset(), fieldEnd - 1));
} else if (endsBeforeField) {
/*
* The Toffs starts before this value but ends in this value
* which means we're really continuing from where we left off
* above. Since we use the remainder of the offset we can remove
* it from the iterator.
*/
toffsList.add(new Toffs(fieldStart, toffs.getEndOffset()));
toffsIterator.remove();
} else {
/*
* The Toffs spans the whole value so we clamp on both sides.
* This is basically a combination of both arms of the loop
* above.
*/
toffsList.add(new Toffs(fieldStart, fieldEnd - 1));
}
}
if (!toffsList.isEmpty()) {

View File

@ -574,6 +574,72 @@ public class FastVectorHighlighterTest extends LuceneTestCase {
dir.close();
}
public void testPhrasesSpanningFieldValues() throws IOException {
Directory dir = newDirectory();
// positionIncrementGap is 0 so the pharse is found across multiple field
// values.
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPositions(true);
type.setStoreTermVectors(true);
type.freeze();
Document doc = new Document();
doc.add( new Field( "field", "one two three five", type ) );
doc.add( new Field( "field", "two three four", type ) );
doc.add( new Field( "field", "five six five", type ) );
doc.add( new Field( "field", "six seven eight nine eight nine eight " +
"nine eight nine eight nine eight nine", type ) );
doc.add( new Field( "field", "eight nine", type ) );
doc.add( new Field( "field", "ten eleven", type ) );
doc.add( new Field( "field", "twelve thirteen", type ) );
writer.addDocument(doc);
BaseFragListBuilder fragListBuilder = new SimpleFragListBuilder();
BaseFragmentsBuilder fragmentsBuilder = new SimpleFragmentsBuilder();
fragmentsBuilder.setDiscreteMultiValueHighlighting(true);
FastVectorHighlighter highlighter = new FastVectorHighlighter(true, true, fragListBuilder, fragmentsBuilder);
IndexReader reader = DirectoryReader.open(writer, true);
int docId = 0;
// Phrase that spans a field value
Query q = new PhraseQuery("field", "four", "five");
FieldQuery fieldQuery = highlighter.getFieldQuery(q, reader);
String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 1000, 1000);
assertEquals("two three <b>four</b>", bestFragments[0]);
assertEquals("<b>five</b> six five", bestFragments[1]);
assertEquals(2, bestFragments.length);
// Phrase that ends at a field value
q = new PhraseQuery("field", "three", "five");
fieldQuery = highlighter.getFieldQuery(q, reader);
bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 1000, 1000);
assertEquals("one two <b>three five</b>", bestFragments[0]);
assertEquals(1, bestFragments.length);
// Phrase that spans across three values
q = new PhraseQuery("field", "nine", "ten", "eleven", "twelve");
fieldQuery = highlighter.getFieldQuery(q, reader);
bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 1000, 1000);
assertEquals("eight <b>nine</b>", bestFragments[0]);
assertEquals("<b>ten eleven</b>", bestFragments[1]);
assertEquals("<b>twelve</b> thirteen", bestFragments[2]);
assertEquals(3, bestFragments.length);
// Term query that appears in multiple values
q = new TermQuery(new Term("field", "two"));
fieldQuery = highlighter.getFieldQuery(q, reader);
bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 1000, 1000);
assertEquals("one <b>two</b> three five", bestFragments[0]);
assertEquals("<b>two</b> three four", bestFragments[1]);
assertEquals(2, bestFragments.length);
reader.close();
writer.close();
dir.close();
}
private void matchedFieldsTestCase( String fieldValue, String expected, Query... queryClauses ) throws IOException {
matchedFieldsTestCase( true, true, fieldValue, expected, queryClauses );
}