LUCENE-3087: fix highlighter case that prevented highlighting exact phrase when tokens overlap

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1102377 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-05-12 16:54:45 +00:00
parent 858aa929c4
commit 33eb3e534e
3 changed files with 123 additions and 16 deletions

View File

@ -188,6 +188,10 @@ Bug fixes
* LUCENE-2943: Fix thread-safety issues with ICUCollationKeyFilter.
(Robert Muir)
* LUCENE-3087: Highlighter: fix case that was preventing highlighting
of exact phrase when tokens overlap. (Pierre Gossé via Mike
McCandless)
API Changes
* LUCENE-2867: Some contrib queryparser methods that receives CharSequence as

View File

@ -30,6 +30,7 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
@ -158,10 +159,13 @@ public class TokenSources {
OffsetAttribute offsetAtt;
PositionIncrementAttribute posincAtt;
/**
 * Creates a TokenStream that replays the given pre-built tokens in order.
 *
 * @param tokens tokens to replay; consumed by position in incrementToken
 */
StoredTokenStream(Token[] tokens) {
  this.tokens = tokens;
  termAtt = addAttribute(CharTermAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  // addAttribute is generic and already returns the requested type;
  // the explicit cast used here was redundant (cf. the two calls above).
  posincAtt = addAttribute(PositionIncrementAttribute.class);
}
@Override
@ -173,6 +177,10 @@ public class TokenSources {
clearAttributes();
termAtt.setEmpty().append(token);
offsetAtt.setOffset(token.startOffset(), token.endOffset());
posincAtt
.setPositionIncrement(currentToken <= 1
|| tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
.startOffset() ? 1 : 0);
return true;
}
}
@ -180,7 +188,6 @@ public class TokenSources {
BytesRef[] terms = tpv.getTerms();
int[] freq = tpv.getTermFrequencies();
int totalTokens = 0;
for (int t = 0; t < freq.length; t++) {
totalTokens += freq[t];
}
@ -189,7 +196,8 @@ public class TokenSources {
for (int t = 0; t < freq.length; t++) {
TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
if (offsets == null) {
throw new IllegalArgumentException("Required TermVector Offset information was not found");
throw new IllegalArgumentException(
"Required TermVector Offset information was not found");
}
int[] pos = null;
@ -205,8 +213,8 @@ public class TokenSources {
unsortedTokens = new ArrayList<Token>();
}
for (int tp = 0; tp < offsets.length; tp++) {
Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(), offsets[tp]
.getEndOffset());
Token token = new Token(terms[t].utf8ToString(),
offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
unsortedTokens.add(token);
}
} else {
@ -221,8 +229,8 @@ public class TokenSources {
// tokens stored with positions - can use this to index straight into
// sorted array
for (int tp = 0; tp < pos.length; tp++) {
Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(),
offsets[tp].getEndOffset());
Token token = new Token(terms[t].utf8ToString(),
offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
tokensInOriginalOrder[pos[tp]] = token;
}
}
@ -233,10 +241,9 @@ public class TokenSources {
.size()]);
ArrayUtil.mergeSort(tokensInOriginalOrder, new Comparator<Token>() {
/**
 * Orders tokens by start offset, breaking ties by end offset, so that
 * overlapping tokens sharing a start offset come out in a stable order.
 */
public int compare(Token t1, Token t2) {
  // Integer.compare avoids the overflow risk of subtracting ints directly.
  if (t1.startOffset() == t2.startOffset()) {
    return Integer.compare(t1.endOffset(), t2.endOffset());
  }
  return Integer.compare(t1.startOffset(), t2.startOffset());
}
});
}

View File

@ -36,7 +36,10 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
@ -188,4 +191,97 @@ public class TokenSourcesTest extends LuceneTestCase {
}
}
/**
 * LUCENE-3087: an exact-phrase span query ("the fox") must be highlighted
 * as one contiguous fragment when the analyzer emits overlapping tokens and
 * the stored term vector carries offsets only (no positions).
 */
public void testOverlapWithOffsetExactPhrase() throws CorruptIndexException,
    LockObtainFailedException, IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory,
      newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
  try {
    // Index a single document whose term vector stores offsets but NOT
    // positions, forcing the highlighter down the offset-only code path.
    final Document document = new Document();
    document.add(new Field(FIELD, new TokenStreamOverlap(),
        TermVector.WITH_OFFSETS));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = IndexReader.open(directory, true);
  try {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    try {
      // Exact phrase "the fox": slop 0, in order.
      final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
          new SpanTermQuery(new Term(FIELD, "the")),
          new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
      TopDocs hits = indexSearcher.search(phraseQuery, 1);
      assertEquals(1, hits.totalHits);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      final TokenStream tokenStream = TokenSources
          .getTokenStream(
              (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
              false);
      // Both phrase terms must end up inside a single <B> tag.
      assertEquals("<B>the fox</B> did not jump",
          highlighter.getBestFragment(tokenStream, TEXT));
    } finally {
      indexSearcher.close();
    }
  } finally {
    indexReader.close();
    directory.close();
  }
}
/**
 * LUCENE-3087 companion case: same exact-phrase highlighting check as the
 * offset-only test, but with the term vector storing positions AND offsets,
 * exercising the position-aware token-restoration code path.
 */
public void testOverlapWithPositionsAndOffsetExactPhrase()
    throws CorruptIndexException, LockObtainFailedException, IOException,
    InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory,
      newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
  try {
    // Unlike the offset-only test, this document stores positions too.
    final Document document = new Document();
    document.add(new Field(FIELD, new TokenStreamOverlap(),
        TermVector.WITH_POSITIONS_OFFSETS));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = IndexReader.open(directory, true);
  try {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    try {
      // Exact phrase "the fox": slop 0, in order.
      final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
          new SpanTermQuery(new Term(FIELD, "the")),
          new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
      TopDocs hits = indexSearcher.search(phraseQuery, 1);
      assertEquals(1, hits.totalHits);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      final TokenStream tokenStream = TokenSources
          .getTokenStream(
              (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
              false);
      // Both phrase terms must end up inside a single <B> tag.
      assertEquals("<B>the fox</B> did not jump",
          highlighter.getBestFragment(tokenStream, TEXT));
    } finally {
      indexSearcher.close();
    }
  } finally {
    indexReader.close();
    directory.close();
  }
}
}