From 41ea8fdcec064b5920a1ae392974fa6c51e9aef6 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Mon, 17 Jul 2017 19:10:46 +0200 Subject: [PATCH] Picks offset source for the unified highlighter directly from the es mapping (#25747) This commit changes how the offset source is picked for each field using the es mapping rather than the underlying Lucene field infos. It's mandatory for large mappings where field infos retrieval can be costly (the global field infos is merged for each highlighted field in every hit by the Lucene impl). Fixes #25699 --- .../uhighlight/CustomUnifiedHighlighter.java | 28 +++++++++++++--- .../highlight/UnifiedHighlighter.java | 32 +++++++++++++------ .../CustomUnifiedHighlighterTests.java | 6 ++-- 3 files changed, 49 insertions(+), 17 deletions(-) diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java index ebc13298202..669303cc3e4 100644 --- a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java +++ b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java @@ -20,6 +20,8 @@ package org.apache.lucene.search.uhighlight; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.Term; import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.DocIdSetIterator; @@ -62,6 +64,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { public static final char MULTIVAL_SEP_CHAR = (char) 0; private static final Snippet[] EMPTY_SNIPPET = new Snippet[0]; + private final OffsetSource offsetSource; private final String fieldValue; private final PassageFormatter passageFormatter; private final BreakIterator breakIterator; @@ -71,24 +74,27 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { /** * Creates a new 
instance of {@link CustomUnifiedHighlighter} * - * @param analyzer the analyzer used for the field at index time, used for multi term queries internally + * @param analyzer the analyzer used for the field at index time, used for multi term queries internally. * @param passageFormatter our own {@link CustomPassageFormatter} - * which generates snippets in forms of {@link Snippet} objects + * which generates snippets in forms of {@link Snippet} objects. + * @param offsetSource the {@link OffsetSource} to use for offsets retrieval. * @param breakIteratorLocale the {@link Locale} to use for dividing text into passages. - * If null {@link Locale#ROOT} is used + * If null {@link Locale#ROOT} is used. * @param breakIterator the {@link BreakIterator} to use for dividing text into passages. * If null {@link BreakIterator#getSentenceInstance(Locale)} is used. - * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR - * @param noMatchSize The size of the text that should be returned when no highlighting can be performed + * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR. + * @param noMatchSize The size of the text that should be returned when no highlighting can be performed. */ public CustomUnifiedHighlighter(IndexSearcher searcher, Analyzer analyzer, + OffsetSource offsetSource, PassageFormatter passageFormatter, @Nullable Locale breakIteratorLocale, @Nullable BreakIterator breakIterator, String fieldValue, int noMatchSize) { super(searcher, analyzer); + this.offsetSource = offsetSource; this.breakIterator = breakIterator; this.breakIteratorLocale = breakIteratorLocale == null ?
Locale.ROOT : breakIteratorLocale; this.passageFormatter = passageFormatter; @@ -213,4 +219,16 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { return null; } } + + /** + * Forces the offset source for this highlighter + */ + @Override + protected OffsetSource getOffsetSource(String field) { + if (offsetSource == null) { + return super.getOffsetSource(field); + } + return offsetSource; + } + } diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java index 64f9b6365b3..223e9f44b4c 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java @@ -19,6 +19,7 @@ package org.elasticsearch.search.fetch.subphase.highlight; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.uhighlight.Snippet; @@ -26,11 +27,13 @@ import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner; import org.apache.lucene.search.uhighlight.CustomPassageFormatter; import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator; import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter; +import org.apache.lucene.search.uhighlight.UnifiedHighlighter.OffsetSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CollectionUtil; import org.elasticsearch.common.Strings; import org.elasticsearch.common.text.Text; import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.search.fetch.FetchPhaseExecutionException; import org.elasticsearch.search.fetch.FetchSubPhase; import 
org.elasticsearch.search.internal.SearchContext; @@ -90,34 +93,35 @@ public class UnifiedHighlighter implements Highlighter { return obj; } }).collect(Collectors.toList()); - IndexSearcher searcher = new IndexSearcher(hitContext.reader()); - CustomUnifiedHighlighter highlighter; + final IndexSearcher searcher = new IndexSearcher(hitContext.reader()); + final CustomUnifiedHighlighter highlighter; + final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR); + final OffsetSource offsetSource = getOffsetSource(fieldMapper.fieldType()); if (field.fieldOptions().numberOfFragments() == 0) { // we use a control char to separate values, which is the only char that the custom break iterator // breaks the text on, so we don't lose the distinction between the different values of a field and we // get back a snippet per value - String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR); CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR); - highlighter = - new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter, - field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, - field.fieldOptions().noMatchSize()); + highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, + mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), + breakIterator, fieldValue, field.fieldOptions().noMatchSize()); numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value } else { //using paragraph separator we make sure that each field value holds a discrete passage for highlighting - String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR); BreakIterator bi = getBreakIterator(field); - highlighter = new CustomUnifiedHighlighter(searcher, analyzer, + highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, mapperHighlighterEntry.passageFormatter, 
field.fieldOptions().boundaryScannerLocale(), bi, fieldValue, field.fieldOptions().noMatchSize()); numberOfFragments = field.fieldOptions().numberOfFragments(); } + if (field.fieldOptions().requireFieldMatch()) { final String fieldName = highlighterContext.fieldName; highlighter.setFieldMatcher((name) -> fieldName.equals(name)); } else { highlighter.setFieldMatcher((name) -> true); } + Snippet[] fieldSnippets = highlighter.highlightField(highlighterContext.fieldName, highlighterContext.query, hitContext.docId(), numberOfFragments); for (Snippet fieldSnippet : fieldSnippets) { @@ -213,6 +217,16 @@ public class UnifiedHighlighter implements Highlighter { return rawValue.substring(0, Math.min(rawValue.length(), Integer.MAX_VALUE - 1)); } + private OffsetSource getOffsetSource(MappedFieldType fieldType) { + if (fieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) { + return fieldType.storeTermVectors() ? OffsetSource.POSTINGS_WITH_TERM_VECTORS : OffsetSource.POSTINGS; + } + if (fieldType.storeTermVectorOffsets()) { + return OffsetSource.TERM_VECTORS; + } + return OffsetSource.ANALYSIS; + } + private static class HighlighterEntry { Map mappers = new HashMap<>(); diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index 27544448e0c..a2fe5d453de 100644 --- a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -77,9 +77,9 @@ public class CustomUnifiedHighlighterTests extends ESTestCase { TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER); assertThat(topDocs.totalHits, equalTo(1L)); String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR)); - CustomUnifiedHighlighter highlighter = new
CustomUnifiedHighlighter(searcher, analyzer, - new CustomPassageFormatter("", "", new DefaultEncoder()), locale, breakIterator, rawValue, - noMatchSize); + CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null, + new CustomPassageFormatter("", "", new DefaultEncoder()), locale, + breakIterator, rawValue, noMatchSize); highlighter.setFieldMatcher((name) -> "text".equals(name)); final Snippet[] snippets = highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);