mirror of https://github.com/apache/lucene.git

LUCENE-4652: highlight multiple fields with postings highlighter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1428147 13f79535-47bb-0310-9956-ffa450edef68

parent b85bf15d38
commit d28c846d34

Passage.java (org.apache.lucene.sandbox.postingshighlight)

@@ -61,6 +61,8 @@ public final class Passage {
 
   /**
    * Start offset of this passage.
+   * @return start index (inclusive) of the passage in the
+   *         original content: always >= 0.
    */
   public int getStartOffset() {
     return startOffset;
@@ -68,6 +70,8 @@ public final class Passage {
 
   /**
    * End offset of this passage.
+   * @return end index (exclusive) of the passage in the
+   *         original content: always >= {@link #getStartOffset()}
    */
   public int getEndOffset() {
     return endOffset;
@@ -91,6 +95,7 @@ public final class Passage {
 
   /**
    * Start offsets of the term matches, in increasing order.
+   * <p>
    * Only {@link #getNumMatches} are valid. Note that these
    * offsets are absolute (not relative to {@link #getStartOffset()}).
    */
@@ -99,19 +104,20 @@ public final class Passage {
   }
 
   /**
-   * End offsets of the term matches, corresponding with
-   * {@link #getMatchStarts}. Note that its possible that
-   * an end offset could exceed beyond the bounds of the passage
-   * ({@link #getEndOffset()}), if the Analyzer produced a term
-   * which spans a passage boundary.
+   * End offsets of the term matches, corresponding with {@link #getMatchStarts}.
+   * <p>
+   * Only {@link #getNumMatches} are valid. Note that its possible that an end offset
+   * could exceed beyond the bounds of the passage ({@link #getEndOffset()}), if the
+   * Analyzer produced a term which spans a passage boundary.
    */
   public int[] getMatchEnds() {
     return matchEnds;
   }
 
   /**
-   * Term of the matches, corresponding with
-   * {@link #getMatchStarts()}.
+   * Term of the matches, corresponding with {@link #getMatchStarts()}.
+   * <p>
+   * Only {@link #getNumMatches()} are valid.
   */
   public Term[] getMatchTerms() {
     return matchTerms;
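
These accessors are what a formatter consumes. A minimal sketch of walking the match data per the javadocs above (the dumpPassage name and the printing are illustrative only, not part of the API):

    // Illustrative only: only the first getNumMatches() entries of each array
    // are valid, and match offsets are absolute within the original content.
    void dumpPassage(Passage p, String content) {
      System.out.println(content.substring(p.getStartOffset(), p.getEndOffset()));
      for (int i = 0; i < p.getNumMatches(); i++) {
        int start = p.getMatchStarts()[i]; // absolute, not relative to getStartOffset()
        int end = p.getMatchEnds()[i];     // may extend past getEndOffset() if a term spans a boundary
        System.out.println("  match: " + content.substring(start, Math.min(end, content.length())));
      }
    }
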
PassageFormatter.java (org.apache.lucene.sandbox.postingshighlight)

@@ -18,7 +18,7 @@ package org.apache.lucene.sandbox.postingshighlight;
  */
 
 /**
- * Constructs a formatted passage.
+ * Creates a formatted snippet from the top passages.
  * <p>
  * The default implementation marks the query terms as bold, and places
  * ellipses between unconnected passages.
@@ -26,6 +26,12 @@ package org.apache.lucene.sandbox.postingshighlight;
  */
 public class PassageFormatter {
 
   /**
    * Formats the top <code>passages</code> from <code>content</code>
    * into a human-readable text snippet.
+   *
+   * @param passages top-N passages for the field. Note these are sorted in
+   *        the order that they appear in the document for convenience.
+   * @param content content for the field.
+   * @return formatted highlight
    */
   public String format(Passage passages[], String content) {
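
Since format(Passage[], String) is public and the bold-and-ellipses behavior is only the default, the markup can be swapped out by subclassing. A hedged sketch (EmphasisFormatter and the <em> markup are invented for illustration; it assumes matches are ordered and non-overlapping):

    // Hypothetical subclass: emit <em> tags instead of the default <b>.
    class EmphasisFormatter extends PassageFormatter {
      @Override
      public String format(Passage passages[], String content) {
        StringBuilder sb = new StringBuilder();
        for (Passage p : passages) {
          int pos = p.getStartOffset();
          for (int i = 0; i < p.getNumMatches(); i++) {
            int start = p.getMatchStarts()[i];
            int end = p.getMatchEnds()[i];
            sb.append(content, pos, start);                        // text before the match
            sb.append("<em>").append(content, start, end).append("</em>");
            pos = end;
          }
          sb.append(content, pos, p.getEndOffset());               // tail of the passage
        }
        return sb.toString();
      }
    }

It would be wired in through the four-argument constructor this commit introduces below:

    PostingsHighlighter h = new PostingsHighlighter(
        PostingsHighlighter.DEFAULT_MAX_LENGTH,
        BreakIterator.getSentenceInstance(Locale.ROOT),
        new PassageScorer(),
        new EmphasisFormatter());
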
PassageScorer.java (org.apache.lucene.sandbox.postingshighlight)

@@ -18,7 +18,7 @@ package org.apache.lucene.sandbox.postingshighlight;
  */
 
 /**
- * Used for ranking passages.
+ * Ranks passages found by {@link PostingsHighlighter}.
  * <p>
  * Each passage is scored as a miniature document within the document.
  * The final score is computed as {@link #norm} * ∑ ({@link #weight} * {@link #tf}).
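
That formula is the whole scoring contract: a passage's score is norm * ∑(weight * tf) over the query terms it contains. A toy illustration of the arithmetic only; the method below is not part of PassageScorer, and its signature is hypothetical (the real norm, weight and tf are the {@link}-ed methods):

    // Illustration of the scoring formula; inputs are hypothetical.
    static float passageScore(float norm, float[] weight, int[] tf) {
      float sum = 0f;
      for (int i = 0; i < weight.length; i++) {
        sum += weight[i] * tf[i];   // weight * tf, per matching term
      }
      return norm * sum;            // norm * ∑ (weight * tf)
    }
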
PostingsHighlighter.java (org.apache.lucene.sandbox.postingshighlight)

@@ -62,19 +62,15 @@ import org.apache.lucene.util.UnicodeUtil;
 *   Field body = new Field("body", "foobar", offsetsType);
 *
 *   // retrieve highlights at query time
-*   PostingsHighlighter highlighter = new PostingsHighlighter("body");
+*   PostingsHighlighter highlighter = new PostingsHighlighter();
 *   Query query = new TermQuery(new Term("body", "highlighting"));
 *   TopDocs topDocs = searcher.search(query, n);
-*   String highlights[] = highlighter.highlight(query, searcher, topDocs);
+*   String highlights[] = highlighter.highlight("body", query, searcher, topDocs);
 * </pre>
 * @lucene.experimental
 */
 public final class PostingsHighlighter {
 
-  // TODO: support highlighting multiple fields at once? someone is bound
-  // to try to use this in a slow way (invoking over and over for each field), which
-  // would be horrible.
-
   // TODO: maybe allow re-analysis for tiny fields? currently we require offsets,
   // but if the analyzer is really fast and the field is tiny, this might really be
   // unnecessary.
@@ -86,25 +82,37 @@ public final class PostingsHighlighter {
    * closer to the beginning of the document better summarize its content */
   public static final int DEFAULT_MAX_LENGTH = 10000;
 
-  private final String field;
-  private final Term floor;
-  private final Term ceiling;
   private final int maxLength;
   private final BreakIterator breakIterator;
   private final PassageScorer scorer;
   private final PassageFormatter formatter;
 
-  public PostingsHighlighter(String field) {
-    this(field, DEFAULT_MAX_LENGTH);
+  /**
+   * Creates a new highlighter with default parameters.
+   */
+  public PostingsHighlighter() {
+    this(DEFAULT_MAX_LENGTH);
   }
 
-  public PostingsHighlighter(String field, int maxLength) {
-    this(field, maxLength, BreakIterator.getSentenceInstance(Locale.ROOT), new PassageScorer(), new PassageFormatter());
+  /**
+   * Creates a new highlighter, specifying maximum content length.
+   * @param maxLength maximum content size to process.
+   * @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
+   */
+  public PostingsHighlighter(int maxLength) {
+    this(maxLength, BreakIterator.getSentenceInstance(Locale.ROOT), new PassageScorer(), new PassageFormatter());
   }
 
-  public PostingsHighlighter(String field, int maxLength, BreakIterator breakIterator, PassageScorer scorer, PassageFormatter formatter) {
-    this.field = field;
-    if (maxLength == Integer.MAX_VALUE) {
+  /**
+   * Creates a new highlighter with custom parameters.
+   * @param maxLength maximum content size to process.
+   * @param breakIterator used for finding passage boundaries.
+   * @param scorer used for ranking passages.
+   * @param formatter used for formatting passages into highlighted snippets.
+   * @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
+   */
+  public PostingsHighlighter(int maxLength, BreakIterator breakIterator, PassageScorer scorer, PassageFormatter formatter) {
+    if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
       // two reasons: no overflow problems in BreakIterator.preceding(offset+1),
       // our sentinel in the offsets queue uses this value to terminate.
       throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
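
The widened guard accepts the full non-negative range below Integer.MAX_VALUE. A quick sketch of the contract (values illustrative):

    new PostingsHighlighter(10000);                // fine: custom content cap
    try {
      new PostingsHighlighter(Integer.MAX_VALUE);  // rejected, per the sentinel/overflow comment above
    } catch (IllegalArgumentException expected) { }
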
@@ -113,26 +121,107 @@ public final class PostingsHighlighter {
     this.breakIterator = breakIterator;
     this.scorer = scorer;
     this.formatter = formatter;
-    floor = new Term(field, "");
-    ceiling = new Term(field, UnicodeUtil.BIG_TERM);
   }
 
   /**
-   * Calls {@link #highlight(Query, IndexSearcher, TopDocs, int) highlight(query, searcher, topDocs, 1)}
+   * Highlights the top passages from a single field.
+   *
+   * @param field field name to highlight.
+   *        Must have a stored string value and also be indexed with offsets.
+   * @param query query to highlight.
+   * @param searcher searcher that was previously used to execute the query.
+   * @param topDocs TopDocs containing the summary result documents to highlight.
+   * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
+   *         If no highlights were found for a document, its value is <code>null</code>.
+   * @throws IOException if an I/O error occurred during processing
+   * @throws IllegalArgumentException if <code>field</code> was indexed without
+   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
-  public String[] highlight(Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
-    return highlight(query, searcher, topDocs, 1);
+  public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
+    return highlight(field, query, searcher, topDocs, 1);
   }
 
-  public String[] highlight(Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
+  /**
+   * Highlights the top-N passages from a single field.
+   *
+   * @param field field name to highlight.
+   *        Must have a stored string value and also be indexed with offsets.
+   * @param query query to highlight.
+   * @param searcher searcher that was previously used to execute the query.
+   * @param topDocs TopDocs containing the summary result documents to highlight.
+   * @param maxPassages The maximum number of top-N ranked passages used to
+   *        form the highlighted snippets.
+   * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
+   *         If no highlights were found for a document, its value is <code>null</code>.
+   * @throws IOException if an I/O error occurred during processing
+   * @throws IllegalArgumentException if <code>field</code> was indexed without
+   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+   */
+  public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
+    Map<String,String[]> res = highlightFields(new String[] { field }, query, searcher, topDocs, maxPassages);
+    return res.get(field);
+  }
+
+  /**
+   * Highlights the top passages from multiple fields.
+   * <p>
+   * Conceptually, this behaves as a more efficient form of:
+   * <pre class="prettyprint">
+   * Map m = new HashMap();
+   * for (String field : fields) {
+   *   m.put(field, highlight(field, query, searcher, topDocs));
+   * }
+   * return m;
+   * </pre>
+   *
+   * @param fields field names to highlight.
+   *        Must have a stored string value and also be indexed with offsets.
+   * @param query query to highlight.
+   * @param searcher searcher that was previously used to execute the query.
+   * @param topDocs TopDocs containing the summary result documents to highlight.
+   * @return Map keyed on field name, containing the array of formatted snippets
+   *         corresponding to the documents in <code>topDocs</code>.
+   *         If no highlights were found for a document, its value is <code>null</code>.
+   * @throws IOException if an I/O error occurred during processing
+   * @throws IllegalArgumentException if <code>field</code> was indexed without
+   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+   */
+  public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
+    return highlightFields(fields, query, searcher, topDocs, 1);
+  }
+
+  /**
+   * Highlights the top-N passages from multiple fields.
+   * <p>
+   * Conceptually, this behaves as a more efficient form of:
+   * <pre class="prettyprint">
+   * Map m = new HashMap();
+   * for (String field : fields) {
+   *   m.put(field, highlight(field, query, searcher, topDocs, maxPassages));
+   * }
+   * return m;
+   * </pre>
+   *
+   * @param fields field names to highlight.
+   *        Must have a stored string value and also be indexed with offsets.
+   * @param query query to highlight.
+   * @param searcher searcher that was previously used to execute the query.
+   * @param topDocs TopDocs containing the summary result documents to highlight.
+   * @param maxPassages The maximum number of top-N ranked passages per-field used to
+   *        form the highlighted snippets.
+   * @return Map keyed on field name, containing the array of formatted snippets
+   *         corresponding to the documents in <code>topDocs</code>.
+   *         If no highlights were found for a document, its value is <code>null</code>.
+   * @throws IOException if an I/O error occurred during processing
+   * @throws IllegalArgumentException if <code>field</code> was indexed without
+   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+   */
+  public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
     final IndexReader reader = searcher.getIndexReader();
     final ScoreDoc scoreDocs[] = topDocs.scoreDocs;
     query = rewrite(query);
-    SortedSet<Term> terms = new TreeSet<Term>();
-    query.extractTerms(terms);
-    terms = terms.subSet(floor, ceiling);
-    Term termTexts[] = terms.toArray(new Term[terms.size()]);
-    // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
+    SortedSet<Term> queryTerms = new TreeSet<Term>();
+    query.extractTerms(queryTerms);
 
     int docids[] = new int[scoreDocs.length];
     for (int i = 0; i < docids.length; i++) {
@@ -140,21 +229,44 @@ public final class PostingsHighlighter {
     }
     IndexReaderContext readerContext = reader.getContext();
     List<AtomicReaderContext> leaves = readerContext.leaves();
-
-    BreakIterator bi = (BreakIterator)breakIterator.clone();
 
     // sort for sequential io
     Arrays.sort(docids);
+    Arrays.sort(fields);
 
-    // pull stored data
-    LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(field, maxLength);
-    String contents[] = new String[docids.length];
-    for (int i = 0; i < contents.length; i++) {
+    // pull stored data:
+    LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength);
+    String contents[][] = new String[fields.length][docids.length];
+    for (int i = 0; i < docids.length; i++) {
       reader.document(docids[i], visitor);
-      contents[i] = visitor.getValue();
+      for (int j = 0; j < fields.length; j++) {
+        contents[j][i] = visitor.getValue(j).toString();
+      }
       visitor.reset();
     }
+
+    BreakIterator bi = (BreakIterator)breakIterator.clone();
+    Map<String,String[]> highlights = new HashMap<String,String[]>();
+    for (int i = 0; i < fields.length; i++) {
+      String field = fields[i];
+      Term floor = new Term(field, "");
+      Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
+      SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
+      // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
+      Term terms[] = fieldTerms.toArray(new Term[fieldTerms.size()]);
+      Map<Integer,String> fieldHighlights = highlightField(field, contents[i], bi, terms, docids, leaves, maxPassages);
+
+      String[] result = new String[scoreDocs.length];
+      for (int j = 0; j < scoreDocs.length; j++) {
+        result[j] = fieldHighlights.get(scoreDocs[j].doc);
+      }
+      highlights.put(field, result);
+    }
+    return highlights;
+  }
+
+  private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, Term terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
+    Map<Integer,String> highlights = new HashMap<Integer,String>();
 
     // reuse in the real sense... for docs in same segment we just advance our old enum
@@ -178,9 +290,9 @@ public final class PostingsHighlighter {
       }
       if (leaf != lastLeaf) {
         termsEnum = t.iterator(null);
-        postings = new DocsAndPositionsEnum[terms.size()];
+        postings = new DocsAndPositionsEnum[terms.length];
       }
-      Passage passages[] = highlightDoc(termTexts, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+      Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
       if (passages.length > 0) {
         // otherwise a null snippet
         highlights.put(doc, formatter.format(passages, content));
@@ -188,17 +300,13 @@ public final class PostingsHighlighter {
       lastLeaf = leaf;
     }
 
-    String[] result = new String[scoreDocs.length];
-    for (int i = 0; i < scoreDocs.length; i++) {
-      result[i] = highlights.get(scoreDocs[i].doc);
-    }
-    return result;
+    return highlights;
   }
 
   // algorithm: treat sentence snippets as miniature documents
   // we can intersect these with the postings lists via BreakIterator.preceding(offset),
   // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
-  private Passage[] highlightDoc(Term terms[], int contentLength, BreakIterator bi, int doc,
+  private Passage[] highlightDoc(String field, Term terms[], int contentLength, BreakIterator bi, int doc,
                                  TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
     PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>();
     float weights[] = new float[terms.length];
@@ -381,17 +489,24 @@ public final class PostingsHighlighter {
   }
 
   private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
-    private final String field;
+    private final String fields[];
     private final int maxLength;
-    private final StringBuilder builder = new StringBuilder();
+    private final StringBuilder builders[];
+    private int currentField = -1;
 
-    public LimitedStoredFieldVisitor(String field, int maxLength) {
-      this.field = field;
+    public LimitedStoredFieldVisitor(String fields[], int maxLength) {
+      this.fields = fields;
       this.maxLength = maxLength;
+      builders = new StringBuilder[fields.length];
+      for (int i = 0; i < builders.length; i++) {
+        builders[i] = new StringBuilder();
+      }
     }
 
     @Override
     public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+      assert currentField >= 0;
+      StringBuilder builder = builders[currentField];
       if (builder.length() > 0) {
         builder.append(' '); // for the offset gap, TODO: make this configurable
       }
@@ -404,22 +519,24 @@ public final class PostingsHighlighter {
 
     @Override
     public Status needsField(FieldInfo fieldInfo) throws IOException {
-      if (fieldInfo.name.equals(field)) {
-        if (builder.length() > maxLength) {
-          return Status.STOP;
-        }
-        return Status.YES;
-      } else {
+      currentField = Arrays.binarySearch(fields, fieldInfo.name);
+      if (currentField < 0) {
         return Status.NO;
-      }
+      } else if (builders[currentField].length() > maxLength) {
+        return fields.length == 1 ? Status.STOP : Status.NO;
       }
+      return Status.YES;
     }
 
-    String getValue() {
-      return builder.toString();
+    String getValue(int i) {
+      return builders[i].toString();
     }
 
     void reset() {
+      currentField = -1;
-      builder.setLength(0);
+      for (int i = 0; i < fields.length; i++) {
+        builders[i].setLength(0);
+      }
     }
   }
 }
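
Taken together, the API change reads like this at call sites (a sketch mirroring the javadoc example; query, searcher and topDocs are assumed to exist):

    // Before this commit: one highlighter bound to one field.
    //   PostingsHighlighter highlighter = new PostingsHighlighter("body");
    //   String snippets[] = highlighter.highlight(query, searcher, topDocs);

    // After: one highlighter, field chosen per call...
    PostingsHighlighter highlighter = new PostingsHighlighter();
    String bodySnippets[] = highlighter.highlight("body", query, searcher, topDocs);

    // ...and multiple fields highlighted in a single pass over the stored documents:
    Map<String,String[]> byField =
        highlighter.highlightFields(new String[] { "body", "title" }, query, searcher, topDocs);
    String firstTitleSnippet = byField.get("title")[0]; // null if nothing matched in that doc
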
TestPostingsHighlighter.java (org.apache.lucene.sandbox.postingshighlight)

@@ -17,6 +17,8 @@ package org.apache.lucene.sandbox.postingshighlight;
  * limitations under the License.
  */
 
+import java.util.Map;
+
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.document.Document;
@@ -63,11 +65,11 @@ public class TestPostingsHighlighter extends LuceneTestCase {
     iw.close();
 
     IndexSearcher searcher = newSearcher(ir);
-    PostingsHighlighter highlighter = new PostingsHighlighter("body");
+    PostingsHighlighter highlighter = new PostingsHighlighter();
     Query query = new TermQuery(new Term("body", "highlighting"));
     TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
     assertEquals(2, topDocs.totalHits);
-    String snippets[] = highlighter.highlight(query, searcher, topDocs);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
     assertEquals(2, snippets.length);
     assertEquals("Just a test <b>highlighting</b> from postings. ", snippets[0]);
     assertEquals("<b>Highlighting</b> the first term. ", snippets[1]);
@@ -99,11 +101,11 @@ public class TestPostingsHighlighter extends LuceneTestCase {
     iw.close();
 
     IndexSearcher searcher = newSearcher(ir);
-    PostingsHighlighter highlighter = new PostingsHighlighter("body");
+    PostingsHighlighter highlighter = new PostingsHighlighter();
     Query query = new TermQuery(new Term("body", "test"));
     TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
     assertEquals(2, topDocs.totalHits);
-    String snippets[] = highlighter.highlight(query, searcher, topDocs);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
     assertEquals(2, snippets.length);
     assertEquals("This is a <b>test</b>.", snippets[0]);
     assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
@@ -112,6 +114,47 @@ public class TestPostingsHighlighter extends LuceneTestCase {
     dir.close();
   }
 
+  public void testMultipleFields() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", "", offsetsType);
+    Field title = new Field("title", "", offsetsType);
+    Document doc = new Document();
+    doc.add(body);
+    doc.add(title);
+
+    body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore.");
+    title.setStringValue("I am hoping for the best.");
+    iw.addDocument(doc);
+    body.setStringValue("Highlighting the first term. Hope it works.");
+    title.setStringValue("But best may not be good enough.");
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    BooleanQuery query = new BooleanQuery();
+    query.add(new TermQuery(new Term("body", "highlighting")), BooleanClause.Occur.SHOULD);
+    query.add(new TermQuery(new Term("title", "best")), BooleanClause.Occur.SHOULD);
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(2, topDocs.totalHits);
+    Map<String,String[]> snippets = highlighter.highlightFields(new String [] { "body", "title" }, query, searcher, topDocs);
+    assertEquals(2, snippets.size());
+    assertEquals("Just a test <b>highlighting</b> from postings. ", snippets.get("body")[0]);
+    assertEquals("<b>Highlighting</b> the first term. ", snippets.get("body")[1]);
+    assertEquals("I am hoping for the <b>best</b>.", snippets.get("title")[0]);
+    assertEquals("But <b>best</b> may not be good enough.", snippets.get("title")[1]);
+    ir.close();
+    dir.close();
+  }
+
   public void testMultipleTerms() throws Exception {
     Directory dir = newDirectory();
     IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
@@ -133,14 +176,14 @@ public class TestPostingsHighlighter extends LuceneTestCase {
     iw.close();
 
     IndexSearcher searcher = newSearcher(ir);
-    PostingsHighlighter highlighter = new PostingsHighlighter("body");
+    PostingsHighlighter highlighter = new PostingsHighlighter();
     BooleanQuery query = new BooleanQuery();
     query.add(new TermQuery(new Term("body", "highlighting")), BooleanClause.Occur.SHOULD);
     query.add(new TermQuery(new Term("body", "just")), BooleanClause.Occur.SHOULD);
     query.add(new TermQuery(new Term("body", "first")), BooleanClause.Occur.SHOULD);
     TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
     assertEquals(2, topDocs.totalHits);
-    String snippets[] = highlighter.highlight(query, searcher, topDocs);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
     assertEquals(2, snippets.length);
     assertEquals("<b>Just</b> a test <b>highlighting</b> from postings. ", snippets[0]);
     assertEquals("<b>Highlighting</b> the <b>first</b> term. ", snippets[1]);
@@ -170,11 +213,11 @@ public class TestPostingsHighlighter extends LuceneTestCase {
     iw.close();
 
     IndexSearcher searcher = newSearcher(ir);
-    PostingsHighlighter highlighter = new PostingsHighlighter("body");
+    PostingsHighlighter highlighter = new PostingsHighlighter();
     Query query = new TermQuery(new Term("body", "test"));
     TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
     assertEquals(2, topDocs.totalHits);
-    String snippets[] = highlighter.highlight(query, searcher, topDocs, 2);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2);
     assertEquals(2, snippets.length);
     assertEquals("This is a <b>test</b>. Just a <b>test</b> highlighting from postings. ", snippets[0]);
     assertEquals("This <b>test</b> is another <b>test</b>. ... <b>Test</b> <b>test</b> <b>test</b> <b>test</b>.", snippets[1]);
@@ -204,12 +247,12 @@ public class TestPostingsHighlighter extends LuceneTestCase {
     iw.close();
 
     IndexSearcher searcher = newSearcher(ir);
-    PostingsHighlighter highlighter = new PostingsHighlighter("body");
+    PostingsHighlighter highlighter = new PostingsHighlighter();
     Query query = new TermQuery(new Term("body", "test"));
     TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
     assertEquals(2, topDocs.totalHits);
     try {
-      highlighter.highlight(query, searcher, topDocs, 2);
+      highlighter.highlight("body", query, searcher, topDocs, 2);
       fail("did not hit expected exception");
     } catch (IllegalArgumentException iae) {
       // expected
TestPostingsHighlighterRanking.java (org.apache.lucene.sandbox.postingshighlight)

@@ -109,14 +109,12 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
   private void checkQuery(IndexSearcher is, Query query, int doc, int maxTopN) throws IOException {
     for (int n = 1; n < maxTopN; n++) {
       FakePassageFormatter f1 = new FakePassageFormatter();
-      PostingsHighlighter p1 = new PostingsHighlighter("body",
-                                                       Integer.MAX_VALUE-1,
+      PostingsHighlighter p1 = new PostingsHighlighter(Integer.MAX_VALUE-1,
                                                        BreakIterator.getSentenceInstance(Locale.ROOT),
                                                        new PassageScorer(),
                                                        f1);
       FakePassageFormatter f2 = new FakePassageFormatter();
-      PostingsHighlighter p2 = new PostingsHighlighter("body",
-                                                       Integer.MAX_VALUE-1,
+      PostingsHighlighter p2 = new PostingsHighlighter(Integer.MAX_VALUE-1,
                                                        BreakIterator.getSentenceInstance(Locale.ROOT),
                                                        new PassageScorer(),
                                                        f2);
@@ -124,8 +122,8 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
       bq.add(query, BooleanClause.Occur.MUST);
       bq.add(new TermQuery(new Term("id", Integer.toString(doc))), BooleanClause.Occur.MUST);
       TopDocs td = is.search(bq, 1);
-      p1.highlight(bq, is, td, n);
-      p2.highlight(bq, is, td, n+1);
+      p1.highlight("body", bq, is, td, n);
+      p2.highlight("body", bq, is, td, n+1);
       assertTrue(f2.seen.containsAll(f1.seen));
     }
   }