diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 3603295a581..f15db7161e0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -268,6 +268,11 @@ New Features * SOLR-4565: Extend NorwegianLightStemFilter and NorwegianMinimalStemFilter to handle "nynorsk" (Erlend GarĂ¥sen, janhoy via Robert Muir) +* LUCENE-5087: Add getMultiValuedSeparator to PostingsHighlighter, for cases + where you want a different logical separator between field values. This can + be set to e.g. U+2029 PARAGRAPH SEPARATOR if you never want passes to span + values. (Mike McCandless, Robert Muir) + API Changes * LUCENE-5077: Make it easier to use compressed norms. Lucene42NormsFormat takes diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java index b52fcbe1006..715efebde7d 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java @@ -369,7 +369,11 @@ public class PostingsHighlighter { * identical to what was indexed. */ protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { String contents[][] = new String[fields.length][docids.length]; - LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength); + char valueSeparators[] = new char[fields.length]; + for (int i = 0; i < fields.length; i++) { + valueSeparators[i] = getMultiValuedSeparator(fields[i]); + } + LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength); for (int i = 0; i < docids.length; i++) { searcher.doc(docids[i], visitor); for (int j = 0; j < fields.length; j++) { @@ -379,6 +383,16 @@ public class PostingsHighlighter { } return contents; } + + /** + * Returns the logical separator between values for multi-valued fields. + * The default value is a space character, which means passages can span across values, + * but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)} + * if each value holds a discrete passage for highlighting. + */ + protected char getMultiValuedSeparator(String field) { + return ' '; + } private Map highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List leaves, int maxPassages) throws IOException { Map highlights = new HashMap(); @@ -652,12 +666,15 @@ public class PostingsHighlighter { private static class LimitedStoredFieldVisitor extends StoredFieldVisitor { private final String fields[]; + private final char valueSeparators[]; private final int maxLength; private final StringBuilder builders[]; private int currentField = -1; - public LimitedStoredFieldVisitor(String fields[], int maxLength) { + public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) { + assert fields.length == valueSeparators.length; this.fields = fields; + this.valueSeparators = valueSeparators; this.maxLength = maxLength; builders = new StringBuilder[fields.length]; for (int i = 0; i < builders.length; i++) { @@ -670,7 +687,7 @@ public class PostingsHighlighter { assert currentField >= 0; StringBuilder builder = builders[currentField]; if (builder.length() > 0 && builder.length() < maxLength) { - builder.append(' '); // for the offset gap, TODO: make this configurable + builder.append(valueSeparators[currentField]); } if (builder.length() + value.length() > maxLength) { builder.append(value, 0, maxLength - builder.length()); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java index 8308bed309f..f72c59c8486 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java @@ -922,4 +922,48 @@ public class TestPostingsHighlighter extends LuceneTestCase { ir.close(); dir.close(); } + + /** customizing the gap separator to force a sentence break */ + public void testGapSeparator() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." is a token) + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Document doc = new Document(); + + Field body1 = new Field("body", "", offsetsType); + body1.setStringValue("This is a multivalued field"); + doc.add(body1); + + Field body2 = new Field("body", "", offsetsType); + body2.setStringValue("This is something different"); + doc.add(body2); + + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected char getMultiValuedSeparator(String field) { + assert field.equals("body"); + return '\u2029'; + } + }; + Query query = new TermQuery(new Term("body", "field")); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(1, snippets.length); + assertEquals("This is a multivalued field\u2029", snippets[0]); + + ir.close(); + dir.close(); + } } diff --git a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java index a1623f411aa..012c9dedadb 100644 --- a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java +++ b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java @@ -67,6 +67,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized; * <str name="hl.bs.variant"></str> * <str name="hl.bs.type">SENTENCE</str> * <int name="hl.maxAnalyzedChars">10000</int> + * <str name="hl.multiValuedSeparatorChar"> </str> * </lst> * </requestHandler> * @@ -96,6 +97,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized; *
  • hl.bs.country (string) specifies country code for BreakIterator. default is empty string (root locale) *
  • hl.bs.variant (string) specifies country code for BreakIterator. default is empty string (root locale) *
  • hl.maxAnalyzedChars specifies how many characters at most will be processed in a document. + *
  • hl.multiValuedSeparatorChar specifies the logical separator between values for multi-valued fields. * NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field * * @@ -167,6 +169,15 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn String type = params.getFieldParam(field, HighlightParams.BS_TYPE); return parseBreakIterator(type, locale); } + + @Override + protected char getMultiValuedSeparator(String field) { + String sep = params.getFieldParam(field, HighlightParams.MULTI_VALUED_SEPARATOR, " "); + if (sep.length() != 1) { + throw new IllegalArgumentException(HighlightParams.MULTI_VALUED_SEPARATOR + " must be exactly one character."); + } + return sep.charAt(0); + } }; Map snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages);