LUCENE-5087: Add getMultiValuedSeparator to PostingsHighlighter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1498945 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-07-02 14:08:26 +00:00
parent 7b5c09d061
commit 6b1cbf2d40
4 changed files with 80 additions and 3 deletions

View File

@ -268,6 +268,11 @@ New Features
* SOLR-4565: Extend NorwegianLightStemFilter and NorwegianMinimalStemFilter
to handle "nynorsk" (Erlend Garåsen, janhoy via Robert Muir)
* LUCENE-5087: Add getMultiValuedSeparator to PostingsHighlighter, for cases
where you want a different logical separator between field values. This can
be set to e.g. U+2029 PARAGRAPH SEPARATOR if you never want passes to span
values. (Mike McCandless, Robert Muir)
API Changes
* LUCENE-5077: Make it easier to use compressed norms. Lucene42NormsFormat takes

View File

@ -369,7 +369,11 @@ public class PostingsHighlighter {
* identical to what was indexed. */
protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
String contents[][] = new String[fields.length][docids.length];
LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength);
char valueSeparators[] = new char[fields.length];
for (int i = 0; i < fields.length; i++) {
valueSeparators[i] = getMultiValuedSeparator(fields[i]);
}
LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength);
for (int i = 0; i < docids.length; i++) {
searcher.doc(docids[i], visitor);
for (int j = 0; j < fields.length; j++) {
@ -379,6 +383,16 @@ public class PostingsHighlighter {
}
return contents;
}
/**
* Returns the logical separator between values for multi-valued fields.
* The default value is a space character, which means passages can span across values,
* but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)}
* if each value holds a discrete passage for highlighting.
*/
protected char getMultiValuedSeparator(String field) {
return ' ';
}
private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
Map<Integer,String> highlights = new HashMap<Integer,String>();
@ -652,12 +666,15 @@ public class PostingsHighlighter {
private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
private final String fields[];
private final char valueSeparators[];
private final int maxLength;
private final StringBuilder builders[];
private int currentField = -1;
public LimitedStoredFieldVisitor(String fields[], int maxLength) {
public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) {
assert fields.length == valueSeparators.length;
this.fields = fields;
this.valueSeparators = valueSeparators;
this.maxLength = maxLength;
builders = new StringBuilder[fields.length];
for (int i = 0; i < builders.length; i++) {
@ -670,7 +687,7 @@ public class PostingsHighlighter {
assert currentField >= 0;
StringBuilder builder = builders[currentField];
if (builder.length() > 0 && builder.length() < maxLength) {
builder.append(' '); // for the offset gap, TODO: make this configurable
builder.append(valueSeparators[currentField]);
}
if (builder.length() + value.length() > maxLength) {
builder.append(value, 0, maxLength - builder.length());

View File

@ -922,4 +922,48 @@ public class TestPostingsHighlighter extends LuceneTestCase {
ir.close();
dir.close();
}
/** customizing the gap separator to force a sentence break */
public void testGapSeparator() throws Exception {
Directory dir = newDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Document doc = new Document();
Field body1 = new Field("body", "", offsetsType);
body1.setStringValue("This is a multivalued field");
doc.add(body1);
Field body2 = new Field("body", "", offsetsType);
body2.setStringValue("This is something different");
doc.add(body2);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected char getMultiValuedSeparator(String field) {
assert field.equals("body");
return '\u2029';
}
};
Query query = new TermQuery(new Term("body", "field"));
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(1, snippets.length);
assertEquals("This is a multivalued <b>field</b>\u2029", snippets[0]);
ir.close();
dir.close();
}
}

View File

@ -67,6 +67,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
* &lt;str name="hl.bs.variant"&gt;&lt;/str&gt;
* &lt;str name="hl.bs.type"&gt;SENTENCE&lt;/str&gt;
* &lt;int name="hl.maxAnalyzedChars"&gt;10000&lt;/int&gt;
* &lt;str name="hl.multiValuedSeparatorChar"&gt; &lt;/str&gt;
* &lt;/lst&gt;
* &lt;/requestHandler&gt;
* </pre>
@ -96,6 +97,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
* <li>hl.bs.country (string) specifies country code for BreakIterator. default is empty string (root locale)
* <li>hl.bs.variant (string) specifies country code for BreakIterator. default is empty string (root locale)
* <li>hl.maxAnalyzedChars specifies how many characters at most will be processed in a document.
* <li>hl.multiValuedSeparatorChar specifies the logical separator between values for multi-valued fields.
* NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field
* </ul>
*
@ -167,6 +169,15 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
return parseBreakIterator(type, locale);
}
@Override
protected char getMultiValuedSeparator(String field) {
String sep = params.getFieldParam(field, HighlightParams.MULTI_VALUED_SEPARATOR, " ");
if (sep.length() != 1) {
throw new IllegalArgumentException(HighlightParams.MULTI_VALUED_SEPARATOR + " must be exactly one character.");
}
return sep.charAt(0);
}
};
Map<String,String[]> snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages);