From 33eb3e534e1403e78c2047b18fc2fb3c2597cd56 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Thu, 12 May 2011 16:54:45 +0000 Subject: [PATCH] LUCENE-3087: fix highlighter case that prevented highlighting exact phrase when tokens overlap git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1102377 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 4 + .../lucene/search/highlight/TokenSources.java | 27 +++-- .../search/highlight/TokenSourcesTest.java | 108 +++++++++++++++++- 3 files changed, 123 insertions(+), 16 deletions(-) diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 2e6021bf090..3c553a67694 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -188,6 +188,10 @@ Bug fixes * LUCENE-2943: Fix thread-safety issues with ICUCollationKeyFilter. (Robert Muir) + * LUCENE-3087: Highlighter: fix case that was preventing highlighting + of exact phrase when tokens overlap. (Pierre Gossé via Mike + McCandless) + API Changes * LUCENE-2867: Some contrib queryparser methods that receives CharSequence as diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java index 9a9294f93fd..536c7e20465 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermFreqVector; @@ -158,10 +159,13 @@ public 
class TokenSources { OffsetAttribute offsetAtt; + PositionIncrementAttribute posincAtt; + StoredTokenStream(Token tokens[]) { this.tokens = tokens; termAtt = addAttribute(CharTermAttribute.class); offsetAtt = addAttribute(OffsetAttribute.class); + posincAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); } @Override @@ -173,6 +177,10 @@ public class TokenSources { clearAttributes(); termAtt.setEmpty().append(token); offsetAtt.setOffset(token.startOffset(), token.endOffset()); + posincAtt + .setPositionIncrement(currentToken <= 1 + || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2] + .startOffset() ? 1 : 0); return true; } } @@ -180,7 +188,6 @@ public class TokenSources { BytesRef[] terms = tpv.getTerms(); int[] freq = tpv.getTermFrequencies(); int totalTokens = 0; - for (int t = 0; t < freq.length; t++) { totalTokens += freq[t]; } @@ -189,7 +196,8 @@ public class TokenSources { for (int t = 0; t < freq.length; t++) { TermVectorOffsetInfo[] offsets = tpv.getOffsets(t); if (offsets == null) { - throw new IllegalArgumentException("Required TermVector Offset information was not found"); + throw new IllegalArgumentException( + "Required TermVector Offset information was not found"); } int[] pos = null; @@ -205,8 +213,8 @@ public class TokenSources { unsortedTokens = new ArrayList<Token>(); } for (int tp = 0; tp < offsets.length; tp++) { - Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(), offsets[tp] - .getEndOffset()); + Token token = new Token(terms[t].utf8ToString(), + offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); unsortedTokens.add(token); } } else { @@ -221,8 +229,8 @@ public class TokenSources { // tokens stored with positions - can use this to index straight into // sorted array for (int tp = 0; tp < pos.length; tp++) { - Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(), - offsets[tp].getEndOffset()); + Token token = new Token(terms[t].utf8ToString(), 
offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); tokensInOriginalOrder[pos[tp]] = token; } } @@ -233,10 +241,9 @@ public class TokenSources { .size()]); ArrayUtil.mergeSort(tokensInOriginalOrder, new Comparator<Token>() { public int compare(Token t1, Token t2) { - if (t1.startOffset() == t2.startOffset()) - return t1.endOffset() - t2.endOffset(); - else - return t1.startOffset() - t2.startOffset(); + if (t1.startOffset() == t2.startOffset()) return t1.endOffset() + - t2.endOffset(); + else return t1.startOffset() - t2.startOffset(); } }); } diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java index 572aa219b78..02dd92d40e0 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java @@ -36,7 +36,10 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.LockObtainFailedException; @@ -86,12 +89,12 @@ public class TokenSourcesTest extends LuceneTestCase { public void reset() { this.i = -1; this.tokens = new Token[] { - new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3), - new Token(new char[] { '{', 'f', 'o', 'x', '}' }, 0, 5, 0, 7), - new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7), - new Token(new char[] { 'd', 'i', 'd' }, 0, 3, 8, 11), - new Token(new char[] { 'n', 'o', 't' }, 0, 3, 12, 15), - new Token(new char[] { 'j', 'u', 'm', 
'p' }, 0, 4, 16, 20) }; + new Token(new char[] {'t', 'h', 'e'}, 0, 3, 0, 3), + new Token(new char[] {'{', 'f', 'o', 'x', '}'}, 0, 5, 0, 7), + new Token(new char[] {'f', 'o', 'x'}, 0, 3, 4, 7), + new Token(new char[] {'d', 'i', 'd'}, 0, 3, 8, 11), + new Token(new char[] {'n', 'o', 't'}, 0, 3, 12, 15), + new Token(new char[] {'j', 'u', 'm', 'p'}, 0, 4, 16, 20)}; this.tokens[1].setPositionIncrement(0); } } @@ -188,4 +191,97 @@ public class TokenSourcesTest extends LuceneTestCase { } } + public void testOverlapWithOffsetExactPhrase() throws CorruptIndexException, + LockObtainFailedException, IOException, InvalidTokenOffsetsException { + final String TEXT = "the fox did not jump"; + final Directory directory = newDirectory(); + final IndexWriter indexWriter = new IndexWriter(directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer())); + try { + final Document document = new Document(); + document.add(new Field(FIELD, new TokenStreamOverlap(), + TermVector.WITH_OFFSETS)); + indexWriter.addDocument(document); + } finally { + indexWriter.close(); + } + final IndexReader indexReader = IndexReader.open(directory, true); + try { + assertEquals(1, indexReader.numDocs()); + final IndexSearcher indexSearcher = newSearcher(indexReader); + try { + // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1); + // query.add(new SpanTermQuery(new Term(FIELD, "{fox}"))); + // query.add(new SpanTermQuery(new Term(FIELD, "fox"))); + final Query phraseQuery = new SpanNearQuery(new SpanQuery[] { + new SpanTermQuery(new Term(FIELD, "the")), + new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true); + + TopDocs hits = indexSearcher.search(phraseQuery, 1); + assertEquals(1, hits.totalHits); + final Highlighter highlighter = new Highlighter( + new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), + new QueryScorer(phraseQuery)); + final TokenStream tokenStream = TokenSources + .getTokenStream( + (TermPositionVector) indexReader.getTermFreqVector(0, FIELD), + false); + 
assertEquals("<B>the fox</B> did not jump", + highlighter.getBestFragment(tokenStream, TEXT)); + } finally { + indexSearcher.close(); + } + } finally { + indexReader.close(); + directory.close(); + } + } + + public void testOverlapWithPositionsAndOffsetExactPhrase() + throws CorruptIndexException, LockObtainFailedException, IOException, + InvalidTokenOffsetsException { + final String TEXT = "the fox did not jump"; + final Directory directory = newDirectory(); + final IndexWriter indexWriter = new IndexWriter(directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer())); + try { + final Document document = new Document(); + document.add(new Field(FIELD, new TokenStreamOverlap(), + TermVector.WITH_POSITIONS_OFFSETS)); + indexWriter.addDocument(document); + } finally { + indexWriter.close(); + } + final IndexReader indexReader = IndexReader.open(directory, true); + try { + assertEquals(1, indexReader.numDocs()); + final IndexSearcher indexSearcher = newSearcher(indexReader); + try { + // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1); + // query.add(new SpanTermQuery(new Term(FIELD, "the"))); + // query.add(new SpanTermQuery(new Term(FIELD, "fox"))); + final Query phraseQuery = new SpanNearQuery(new SpanQuery[] { + new SpanTermQuery(new Term(FIELD, "the")), + new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true); + + TopDocs hits = indexSearcher.search(phraseQuery, 1); + assertEquals(1, hits.totalHits); + final Highlighter highlighter = new Highlighter( + new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), + new QueryScorer(phraseQuery)); + final TokenStream tokenStream = TokenSources + .getTokenStream( + (TermPositionVector) indexReader.getTermFreqVector(0, FIELD), + false); + assertEquals("<B>the fox</B> did not jump", + highlighter.getBestFragment(tokenStream, TEXT)); + } finally { + indexSearcher.close(); + } + } finally { + indexReader.close(); + directory.close(); + } + } + }