mirror of https://github.com/apache/lucene.git
LUCENE-3087: fix highlighter case that prevented highlighting exact phrase when tokens overlap
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1102377 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
858aa929c4
commit
33eb3e534e
|
@ -188,6 +188,10 @@ Bug fixes
|
|||
* LUCENE-2943: Fix thread-safety issues with ICUCollationKeyFilter.
|
||||
(Robert Muir)
|
||||
|
||||
* LUCENE-3087: Highlighter: fix case that was preventing highlighting
|
||||
of exact phrase when tokens overlap. (Pierre Gossé via Mike
|
||||
McCandless)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-2867: Some contrib queryparser methods that receive CharSequence as
|
||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.lucene.analysis.Token;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.TermFreqVector;
|
||||
|
@ -158,10 +159,13 @@ public class TokenSources {
|
|||
|
||||
OffsetAttribute offsetAtt;
|
||||
|
||||
PositionIncrementAttribute posincAtt;
|
||||
|
||||
/**
 * Stores the given token array and registers the term, offset and
 * position-increment attributes on this stream.
 *
 * @param tokens the tokens kept in this stream's {@code tokens} field
 */
StoredTokenStream(Token[] tokens) {
  this.tokens = tokens;
  termAtt = addAttribute(CharTermAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  // No cast needed: addAttribute already returns the requested attribute
  // type, exactly as in the two assignments above.
  posincAtt = addAttribute(PositionIncrementAttribute.class);
}
|
||||
|
||||
@Override
|
||||
|
@ -173,6 +177,10 @@ public class TokenSources {
|
|||
clearAttributes();
|
||||
termAtt.setEmpty().append(token);
|
||||
offsetAtt.setOffset(token.startOffset(), token.endOffset());
|
||||
posincAtt
|
||||
.setPositionIncrement(currentToken <= 1
|
||||
|| tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
|
||||
.startOffset() ? 1 : 0);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -180,7 +188,6 @@ public class TokenSources {
|
|||
BytesRef[] terms = tpv.getTerms();
|
||||
int[] freq = tpv.getTermFrequencies();
|
||||
int totalTokens = 0;
|
||||
|
||||
for (int t = 0; t < freq.length; t++) {
|
||||
totalTokens += freq[t];
|
||||
}
|
||||
|
@ -189,7 +196,8 @@ public class TokenSources {
|
|||
for (int t = 0; t < freq.length; t++) {
|
||||
TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
|
||||
if (offsets == null) {
|
||||
throw new IllegalArgumentException("Required TermVector Offset information was not found");
|
||||
throw new IllegalArgumentException(
|
||||
"Required TermVector Offset information was not found");
|
||||
}
|
||||
|
||||
int[] pos = null;
|
||||
|
@ -205,8 +213,8 @@ public class TokenSources {
|
|||
unsortedTokens = new ArrayList<Token>();
|
||||
}
|
||||
for (int tp = 0; tp < offsets.length; tp++) {
|
||||
Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(), offsets[tp]
|
||||
.getEndOffset());
|
||||
Token token = new Token(terms[t].utf8ToString(),
|
||||
offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
|
||||
unsortedTokens.add(token);
|
||||
}
|
||||
} else {
|
||||
|
@ -221,8 +229,8 @@ public class TokenSources {
|
|||
// tokens stored with positions - can use this to index straight into
|
||||
// sorted array
|
||||
for (int tp = 0; tp < pos.length; tp++) {
|
||||
Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(),
|
||||
offsets[tp].getEndOffset());
|
||||
Token token = new Token(terms[t].utf8ToString(),
|
||||
offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
|
||||
tokensInOriginalOrder[pos[tp]] = token;
|
||||
}
|
||||
}
|
||||
|
@ -233,10 +241,9 @@ public class TokenSources {
|
|||
.size()]);
|
||||
ArrayUtil.mergeSort(tokensInOriginalOrder, new Comparator<Token>() {
|
||||
public int compare(Token t1, Token t2) {
|
||||
if (t1.startOffset() == t2.startOffset())
|
||||
return t1.endOffset() - t2.endOffset();
|
||||
else
|
||||
return t1.startOffset() - t2.startOffset();
|
||||
if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
|
||||
- t2.endOffset();
|
||||
else return t1.startOffset() - t2.startOffset();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
|
|
@ -36,7 +36,10 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.index.TermPositionVector;
|
||||
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.LockObtainFailedException;
|
||||
|
@ -188,4 +191,97 @@ public class TokenSourcesTest extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testOverlapWithOffsetExactPhrase() throws CorruptIndexException,
|
||||
LockObtainFailedException, IOException, InvalidTokenOffsetsException {
|
||||
final String TEXT = "the fox did not jump";
|
||||
final Directory directory = newDirectory();
|
||||
final IndexWriter indexWriter = new IndexWriter(directory,
|
||||
newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
|
||||
try {
|
||||
final Document document = new Document();
|
||||
document.add(new Field(FIELD, new TokenStreamOverlap(),
|
||||
TermVector.WITH_OFFSETS));
|
||||
indexWriter.addDocument(document);
|
||||
} finally {
|
||||
indexWriter.close();
|
||||
}
|
||||
final IndexReader indexReader = IndexReader.open(directory, true);
|
||||
try {
|
||||
assertEquals(1, indexReader.numDocs());
|
||||
final IndexSearcher indexSearcher = newSearcher(indexReader);
|
||||
try {
|
||||
// final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
|
||||
// query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
|
||||
// query.add(new SpanTermQuery(new Term(FIELD, "fox")));
|
||||
final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
|
||||
new SpanTermQuery(new Term(FIELD, "the")),
|
||||
new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
|
||||
|
||||
TopDocs hits = indexSearcher.search(phraseQuery, 1);
|
||||
assertEquals(1, hits.totalHits);
|
||||
final Highlighter highlighter = new Highlighter(
|
||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||
new QueryScorer(phraseQuery));
|
||||
final TokenStream tokenStream = TokenSources
|
||||
.getTokenStream(
|
||||
(TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
|
||||
false);
|
||||
assertEquals("<B>the fox</B> did not jump",
|
||||
highlighter.getBestFragment(tokenStream, TEXT));
|
||||
} finally {
|
||||
indexSearcher.close();
|
||||
}
|
||||
} finally {
|
||||
indexReader.close();
|
||||
directory.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testOverlapWithPositionsAndOffsetExactPhrase()
|
||||
throws CorruptIndexException, LockObtainFailedException, IOException,
|
||||
InvalidTokenOffsetsException {
|
||||
final String TEXT = "the fox did not jump";
|
||||
final Directory directory = newDirectory();
|
||||
final IndexWriter indexWriter = new IndexWriter(directory,
|
||||
newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
|
||||
try {
|
||||
final Document document = new Document();
|
||||
document.add(new Field(FIELD, new TokenStreamOverlap(),
|
||||
TermVector.WITH_POSITIONS_OFFSETS));
|
||||
indexWriter.addDocument(document);
|
||||
} finally {
|
||||
indexWriter.close();
|
||||
}
|
||||
final IndexReader indexReader = IndexReader.open(directory, true);
|
||||
try {
|
||||
assertEquals(1, indexReader.numDocs());
|
||||
final IndexSearcher indexSearcher = newSearcher(indexReader);
|
||||
try {
|
||||
// final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
|
||||
// query.add(new SpanTermQuery(new Term(FIELD, "the")));
|
||||
// query.add(new SpanTermQuery(new Term(FIELD, "fox")));
|
||||
final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
|
||||
new SpanTermQuery(new Term(FIELD, "the")),
|
||||
new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
|
||||
|
||||
TopDocs hits = indexSearcher.search(phraseQuery, 1);
|
||||
assertEquals(1, hits.totalHits);
|
||||
final Highlighter highlighter = new Highlighter(
|
||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||
new QueryScorer(phraseQuery));
|
||||
final TokenStream tokenStream = TokenSources
|
||||
.getTokenStream(
|
||||
(TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
|
||||
false);
|
||||
assertEquals("<B>the fox</B> did not jump",
|
||||
highlighter.getBestFragment(tokenStream, TEXT));
|
||||
} finally {
|
||||
indexSearcher.close();
|
||||
}
|
||||
} finally {
|
||||
indexReader.close();
|
||||
directory.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue