mirror of https://github.com/apache/lucene.git
LUCENE-5087: Add getMultiValuedSeparator to PostingsHighlighter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1498945 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7b5c09d061
commit
6b1cbf2d40
|
@ -268,6 +268,11 @@ New Features
|
|||
* SOLR-4565: Extend NorwegianLightStemFilter and NorwegianMinimalStemFilter
|
||||
to handle "nynorsk" (Erlend Garåsen, janhoy via Robert Muir)
|
||||
|
||||
* LUCENE-5087: Add getMultiValuedSeparator to PostingsHighlighter, for cases
|
||||
where you want a different logical separator between field values. This can
|
||||
be set to e.g. U+2029 PARAGRAPH SEPARATOR if you never want passes to span
|
||||
values. (Mike McCandless, Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-5077: Make it easier to use compressed norms. Lucene42NormsFormat takes
|
||||
|
|
|
@ -369,7 +369,11 @@ public class PostingsHighlighter {
|
|||
* identical to what was indexed. */
|
||||
protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
|
||||
String contents[][] = new String[fields.length][docids.length];
|
||||
LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength);
|
||||
char valueSeparators[] = new char[fields.length];
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
valueSeparators[i] = getMultiValuedSeparator(fields[i]);
|
||||
}
|
||||
LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength);
|
||||
for (int i = 0; i < docids.length; i++) {
|
||||
searcher.doc(docids[i], visitor);
|
||||
for (int j = 0; j < fields.length; j++) {
|
||||
|
@ -379,6 +383,16 @@ public class PostingsHighlighter {
|
|||
}
|
||||
return contents;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the logical separator between values for multi-valued fields.
|
||||
* The default value is a space character, which means passages can span across values,
|
||||
* but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)}
|
||||
* if each value holds a discrete passage for highlighting.
|
||||
*/
|
||||
protected char getMultiValuedSeparator(String field) {
|
||||
return ' ';
|
||||
}
|
||||
|
||||
private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
|
||||
Map<Integer,String> highlights = new HashMap<Integer,String>();
|
||||
|
@ -652,12 +666,15 @@ public class PostingsHighlighter {
|
|||
|
||||
private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
|
||||
private final String fields[];
|
||||
private final char valueSeparators[];
|
||||
private final int maxLength;
|
||||
private final StringBuilder builders[];
|
||||
private int currentField = -1;
|
||||
|
||||
public LimitedStoredFieldVisitor(String fields[], int maxLength) {
|
||||
public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) {
|
||||
assert fields.length == valueSeparators.length;
|
||||
this.fields = fields;
|
||||
this.valueSeparators = valueSeparators;
|
||||
this.maxLength = maxLength;
|
||||
builders = new StringBuilder[fields.length];
|
||||
for (int i = 0; i < builders.length; i++) {
|
||||
|
@ -670,7 +687,7 @@ public class PostingsHighlighter {
|
|||
assert currentField >= 0;
|
||||
StringBuilder builder = builders[currentField];
|
||||
if (builder.length() > 0 && builder.length() < maxLength) {
|
||||
builder.append(' '); // for the offset gap, TODO: make this configurable
|
||||
builder.append(valueSeparators[currentField]);
|
||||
}
|
||||
if (builder.length() + value.length() > maxLength) {
|
||||
builder.append(value, 0, maxLength - builder.length());
|
||||
|
|
|
@ -922,4 +922,48 @@ public class TestPostingsHighlighter extends LuceneTestCase {
|
|||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
/** customizing the gap separator to force a sentence break */
|
||||
public void testGapSeparator() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
// use simpleanalyzer for more natural tokenization (else "test." is a token)
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Document doc = new Document();
|
||||
|
||||
Field body1 = new Field("body", "", offsetsType);
|
||||
body1.setStringValue("This is a multivalued field");
|
||||
doc.add(body1);
|
||||
|
||||
Field body2 = new Field("body", "", offsetsType);
|
||||
body2.setStringValue("This is something different");
|
||||
doc.add(body2);
|
||||
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter() {
|
||||
@Override
|
||||
protected char getMultiValuedSeparator(String field) {
|
||||
assert field.equals("body");
|
||||
return '\u2029';
|
||||
}
|
||||
};
|
||||
Query query = new TermQuery(new Term("body", "field"));
|
||||
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
|
||||
assertEquals(1, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
|
||||
assertEquals(1, snippets.length);
|
||||
assertEquals("This is a multivalued <b>field</b>\u2029", snippets[0]);
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -67,6 +67,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
|
|||
* <str name="hl.bs.variant"></str>
|
||||
* <str name="hl.bs.type">SENTENCE</str>
|
||||
* <int name="hl.maxAnalyzedChars">10000</int>
|
||||
* <str name="hl.multiValuedSeparatorChar"> </str>
|
||||
* </lst>
|
||||
* </requestHandler>
|
||||
* </pre>
|
||||
|
@ -96,6 +97,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
|
|||
* <li>hl.bs.country (string) specifies country code for BreakIterator. default is empty string (root locale)
|
||||
* <li>hl.bs.variant (string) specifies country code for BreakIterator. default is empty string (root locale)
|
||||
* <li>hl.maxAnalyzedChars specifies how many characters at most will be processed in a document.
|
||||
* <li>hl.multiValuedSeparatorChar specifies the logical separator between values for multi-valued fields.
|
||||
* NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field
|
||||
* </ul>
|
||||
*
|
||||
|
@ -167,6 +169,15 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
|
|||
String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
|
||||
return parseBreakIterator(type, locale);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected char getMultiValuedSeparator(String field) {
|
||||
String sep = params.getFieldParam(field, HighlightParams.MULTI_VALUED_SEPARATOR, " ");
|
||||
if (sep.length() != 1) {
|
||||
throw new IllegalArgumentException(HighlightParams.MULTI_VALUED_SEPARATOR + " must be exactly one character.");
|
||||
}
|
||||
return sep.charAt(0);
|
||||
}
|
||||
};
|
||||
|
||||
Map<String,String[]> snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages);
|
||||
|
|
Loading…
Reference in New Issue