diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java new file mode 100644 index 00000000000..1cd5fb9340d --- /dev/null +++ b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java @@ -0,0 +1,171 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.lucene.search.uhighlight; + +import java.text.BreakIterator; +import java.text.CharacterIterator; +import java.util.Locale; + +/** + * A custom break iterator that scans text to find break-delimited passages bounded by + * a provided maximum length. This class delegates the boundary search to a first level + * break iterator. When this break iterator finds a passage greater than the maximum length + * a secondary break iterator is used to re-split the passage at the first boundary after + * maximum length. + * This is useful to split passages created by {@link BreakIterator}s like `sentence` that + * can create big outliers on semi-structured text. + * + * WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}. + **/ +public class BoundedBreakIteratorScanner extends BreakIterator { + private final BreakIterator mainBreak; + private final BreakIterator innerBreak; + private final int maxLen; + + private int lastPrecedingOffset = -1; + private int windowStart = -1; + private int windowEnd = -1; + private int innerStart = -1; + private int innerEnd = 0; + + private BoundedBreakIteratorScanner(BreakIterator mainBreak, + BreakIterator innerBreak, + int maxLen) { + this.mainBreak = mainBreak; + this.innerBreak = innerBreak; + this.maxLen = maxLen; + } + + @Override + public CharacterIterator getText() { + return mainBreak.getText(); + } + + @Override + public void setText(CharacterIterator newText) { + reset(); + mainBreak.setText(newText); + innerBreak.setText(newText); + } + + @Override + public void setText(String newText) { + reset(); + mainBreak.setText(newText); + innerBreak.setText(newText); + } + + private void reset() { + lastPrecedingOffset = -1; + windowStart = -1; + windowEnd = -1; + innerStart = -1; + innerEnd = 0; + } + + /** + * Must be called with increasing offset. See {@link FieldHighlighter} for usage. + */ + @Override + public int preceding(int offset) { + if (offset < lastPrecedingOffset) { + throw new IllegalArgumentException("offset < lastPrecedingOffset: " + + "usage doesn't look like UnifiedHighlighter"); + } + if (offset > windowStart && offset < windowEnd) { + innerStart = innerEnd; + innerEnd = windowEnd; + } else { + windowStart = innerStart = mainBreak.preceding(offset); + windowEnd = innerEnd = mainBreak.following(offset-1); + } + + if (innerEnd - innerStart > maxLen) { + // the current split is too big, + // so starting from the current term we try to find boundaries on the left first + if (offset - maxLen > innerStart) { + innerStart = Math.max(innerStart, + innerBreak.preceding(offset - maxLen)); + } + // and then we try to expand the passage to the right with the remaining size + int remaining = Math.max(0, maxLen - (offset - innerStart)); + if (offset + remaining < windowEnd) { + innerEnd = Math.min(windowEnd, + innerBreak.following(offset + remaining)); + } + } + lastPrecedingOffset = offset - 1; + return innerStart; + } + + /** + * Can be invoked only after a call to preceding(offset+1). + * See {@link FieldHighlighter} for usage. + */ + @Override + public int following(int offset) { + if (offset != lastPrecedingOffset || innerEnd == -1) { + throw new IllegalArgumentException("offset != lastPrecedingOffset: " + + "usage doesn't look like UnifiedHighlighter"); + } + return innerEnd; + } + + /** + * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen. + * Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}. + */ + public static BreakIterator getSentence(Locale locale, int maxLen) { + final BreakIterator sBreak = BreakIterator.getSentenceInstance(locale); + final BreakIterator wBreak = BreakIterator.getWordInstance(locale); + return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen); + } + + + @Override + public int current() { + // Returns the last offset of the current split + return this.innerEnd; + } + + @Override + public int first() { + throw new IllegalStateException("first() should not be called in this context"); + } + + @Override + public int next() { + throw new IllegalStateException("next() should not be called in this context"); + } + + @Override + public int last() { + throw new IllegalStateException("last() should not be called in this context"); + } + + @Override + public int next(int n) { + throw new IllegalStateException("next(n) should not be called in this context"); + } + + @Override + public int previous() { + throw new IllegalStateException("previous() should not be called in this context"); + } +} diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java new file mode 100644 index 00000000000..915e7cc1531 --- /dev/null +++ b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java @@ -0,0 +1,79 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.search.uhighlight; + +import java.text.BreakIterator; +import java.util.Locale; + +import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; + +/** + * Custom {@link FieldHighlighter} that creates a single passage bounded to {@code noMatchSize} when + * no highlights were found. + */ +class CustomFieldHighlighter extends FieldHighlighter { + private static final Passage[] EMPTY_PASSAGE = new Passage[0]; + + private final Locale breakIteratorLocale; + private final int noMatchSize; + private final String fieldValue; + + CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy, + Locale breakIteratorLocale, BreakIterator breakIterator, + PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, + PassageFormatter passageFormatter, int noMatchSize, String fieldValue) { + super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, + maxNoHighlightPassages, passageFormatter); + this.breakIteratorLocale = breakIteratorLocale; + this.noMatchSize = noMatchSize; + this.fieldValue = fieldValue; + } + + @Override + protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) { + if (noMatchSize > 0) { + int pos = 0; + while (pos < fieldValue.length() && fieldValue.charAt(pos) == MULTIVAL_SEP_CHAR) { + pos ++; + } + if (pos < fieldValue.length()) { + int end = fieldValue.indexOf(MULTIVAL_SEP_CHAR, pos); + if (end == -1) { + end = fieldValue.length(); + } + if (noMatchSize+pos < end) { + BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale); + bi.setText(fieldValue); + // Finds the next word boundary **after** noMatchSize. + end = bi.following(noMatchSize + pos); + if (end == BreakIterator.DONE) { + end = fieldValue.length(); + } + } + Passage passage = new Passage(); + passage.setScore(Float.NaN); + passage.setStartOffset(pos); + passage.setEndOffset(end); + return new Passage[]{passage}; + } + } + return EMPTY_PASSAGE; + } +} diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java index 4f1ec5fdb83..4a20fb0478f 100644 --- a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java +++ b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java @@ -33,6 +33,8 @@ import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.lucene.all.AllTermQuery; import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; @@ -47,6 +49,7 @@ import java.util.Collections; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; /** * Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document. @@ -57,37 +60,41 @@ import java.util.Map; * Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed. */ public class CustomUnifiedHighlighter extends UnifiedHighlighter { + public static final char MULTIVAL_SEP_CHAR = (char) 0; private static final Snippet[] EMPTY_SNIPPET = new Snippet[0]; private final String fieldValue; private final PassageFormatter passageFormatter; private final BreakIterator breakIterator; - private final boolean returnNonHighlightedSnippets; + private final Locale breakIteratorLocale; + private final int noMatchSize; /** * Creates a new instance of {@link CustomUnifiedHighlighter} * * @param analyzer the analyzer used for the field at index time, used for multi term queries internally * @param passageFormatter our own {@link CustomPassageFormatter} - * which generates snippets in forms of {@link Snippet} objects + * which generates snippets in forms of {@link Snippet} objects + * @param breakIteratorLocale the {@link Locale} to use for dividing text into passages. + * If null {@link Locale#ROOT} is used * @param breakIterator the {@link BreakIterator} to use for dividing text into passages. - * If null {@link BreakIterator#getSentenceInstance(Locale)} is used. - * @param fieldValue the original field values as constructor argument, loaded from the _source field or - * the relevant stored field. - * @param returnNonHighlightedSnippets whether non highlighted snippets should be - * returned rather than empty snippets when no highlighting can be performed + * If null {@link BreakIterator#getSentenceInstance(Locale)} is used. + * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR + * @param noMatchSize The size of the text that should be returned when no highlighting can be performed */ public CustomUnifiedHighlighter(IndexSearcher searcher, Analyzer analyzer, PassageFormatter passageFormatter, + @Nullable Locale breakIteratorLocale, @Nullable BreakIterator breakIterator, String fieldValue, - boolean returnNonHighlightedSnippets) { + int noMatchSize) { super(searcher, analyzer); this.breakIterator = breakIterator; + this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale; this.passageFormatter = passageFormatter; this.fieldValue = fieldValue; - this.returnNonHighlightedSnippets = returnNonHighlightedSnippets; + this.noMatchSize = noMatchSize; } /** @@ -111,16 +118,13 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { @Override protected List loadFieldValues(String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException { - //we only highlight one field, one document at a time + // we only highlight one field, one document at a time return Collections.singletonList(new String[]{fieldValue}); } @Override protected BreakIterator getBreakIterator(String field) { - if (breakIterator != null) { - return breakIterator; - } - return super.getBreakIterator(field); + return breakIterator; } @Override @@ -129,11 +133,18 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { } @Override - protected int getMaxNoHighlightPassages(String field) { - if (returnNonHighlightedSnippets) { - return 1; - } - return 0; + protected FieldHighlighter getFieldHighlighter(String field, Query query, Set allTerms, int maxPassages) { + BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms); + Set highlightFlags = getFlags(field); + PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags); + CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags); + OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata); + BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field), + UnifiedHighlighter.MULTIVAL_SEP_CHAR); + FieldOffsetStrategy strategy = + getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags); + return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator, + getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue); } @Override @@ -146,7 +157,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { return rewriteCustomQuery(query); } - /** * Translate custom queries in queries that are supported by the unified highlighter. */ diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java index 0c15f1106b1..b1d557e851a 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java @@ -52,13 +52,14 @@ import java.util.Map; public class FastVectorHighlighter implements Highlighter { private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner(); - private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner( - BreakIterator.getSentenceInstance(Locale.ROOT)); - private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner( - BreakIterator.getWordInstance(Locale.ROOT)); + private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = + new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ROOT)); + private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = + new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(Locale.ROOT)); + + public static final Setting SETTING_TV_HIGHLIGHT_MULTI_VALUE = + Setting.boolSetting("search.highlight.term_vector_multi_value", true, Setting.Property.NodeScope); - public static final Setting SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value", - true, Setting.Property.NodeScope); private static final String CACHE_KEY = "highlight-fsv"; private final Boolean termVectorMultiValue; @@ -74,11 +75,12 @@ public class FastVectorHighlighter implements Highlighter { FieldMapper mapper = highlighterContext.mapper; if (canHighlight(mapper) == false) { - throw new IllegalArgumentException("the field [" + highlighterContext.fieldName - + "] should be indexed with term vector with position offsets to be used with fast vector highlighter"); + throw new IllegalArgumentException("the field [" + highlighterContext.fieldName + + "] should be indexed with term vector with position offsets to be used with fast vector highlighter"); } - Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT; + Encoder encoder = field.fieldOptions().encoder().equals("html") ? + HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT; if (!hitContext.cache().containsKey(CACHE_KEY)) { hitContext.cache().put(CACHE_KEY, new HighlighterEntry()); @@ -90,21 +92,21 @@ public class FastVectorHighlighter implements Highlighter { if (field.fieldOptions().requireFieldMatch()) { if (cache.fieldMatchFieldQuery == null) { /* - * we use top level reader to rewrite the query against all readers, with use caching it across hits (and across - * readers...) + * we use top level reader to rewrite the query against all readers, + * with use caching it across hits (and across readers...) */ - cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(), - true, field.fieldOptions().requireFieldMatch()); + cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, + hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch()); } fieldQuery = cache.fieldMatchFieldQuery; } else { if (cache.noFieldMatchFieldQuery == null) { /* - * we use top level reader to rewrite the query against all readers, with use caching it across hits (and across - * readers...) + * we use top level reader to rewrite the query against all readers, + * with use caching it across hits (and across readers...) */ - cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(), - true, field.fieldOptions().requireFieldMatch()); + cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, + hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch()); } fieldQuery = cache.noFieldMatchFieldQuery; } @@ -128,7 +130,7 @@ public class FastVectorHighlighter implements Highlighter { } } else { fragListBuilder = field.fieldOptions().fragmentOffset() == -1 ? - new SimpleFragListBuilder() : new SimpleFragListBuilder(field.fieldOptions().fragmentOffset()); + new SimpleFragListBuilder() : new SimpleFragListBuilder(field.fieldOptions().fragmentOffset()); if (field.fieldOptions().scoreOrdered()) { if (!forceSource && mapper.fieldType().stored()) { fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.fieldOptions().preTags(), @@ -142,7 +144,8 @@ public class FastVectorHighlighter implements Highlighter { fragmentsBuilder = new SimpleFragmentsBuilder(mapper, field.fieldOptions().preTags(), field.fieldOptions().postTags(), boundaryScanner); } else { - fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(), + fragmentsBuilder = + new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(), field.fieldOptions().postTags(), boundaryScanner); } } @@ -153,8 +156,8 @@ public class FastVectorHighlighter implements Highlighter { entry.fragmentsBuilder = fragmentsBuilder; if (cache.fvh == null) { // parameters to FVH are not requires since: - // first two booleans are not relevant since they are set on the CustomFieldQuery (phrase and fieldMatch) - // fragment builders are used explicitly + // first two booleans are not relevant since they are set on the CustomFieldQuery + // (phrase and fieldMatch) fragment builders are used explicitly cache.fvh = new org.apache.lucene.search.vectorhighlight.FastVectorHighlighter(); } CustomFieldQuery.highlightFilters.set(field.fieldOptions().highlightFilter()); @@ -172,13 +175,14 @@ public class FastVectorHighlighter implements Highlighter { // we highlight against the low level reader and docId, because if we load source, we want to reuse it if possible // Only send matched fields if they were requested to save time. if (field.fieldOptions().matchedFields() != null && !field.fieldOptions().matchedFields().isEmpty()) { - fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(), - field.fieldOptions().matchedFields(), fragmentCharSize, numberOfFragments, entry.fragListBuilder, - entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder); + fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), + mapper.fieldType().name(), field.fieldOptions().matchedFields(), fragmentCharSize, + numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(), + field.fieldOptions().postTags(), encoder); } else { - fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(), - fragmentCharSize, numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(), - field.fieldOptions().postTags(), encoder); + fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), + mapper.fieldType().name(), fragmentCharSize, numberOfFragments, entry.fragListBuilder, + entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder); } if (fragments != null && fragments.length > 0) { @@ -187,11 +191,13 @@ public class FastVectorHighlighter implements Highlighter { int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize(); if (noMatchSize > 0) { - // Essentially we just request that a fragment is built from 0 to noMatchSize using the normal fragmentsBuilder + // Essentially we just request that a fragment is built from 0 to noMatchSize using + // the normal fragmentsBuilder FieldFragList fieldFragList = new SimpleFieldFragList(-1 /*ignored*/); fieldFragList.add(0, noMatchSize, Collections.emptyList()); - fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(), mapper.fieldType().name(), - fieldFragList, 1, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder); + fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(), + mapper.fieldType().name(), fieldFragList, 1, field.fieldOptions().preTags(), + field.fieldOptions().postTags(), encoder); if (fragments != null && fragments.length > 0) { return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments)); } @@ -200,7 +206,8 @@ public class FastVectorHighlighter implements Highlighter { return null; } catch (Exception e) { - throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e); + throw new FetchPhaseExecutionException(context, + "Failed to highlight field [" + highlighterContext.fieldName + "]", e); } } @@ -212,24 +219,31 @@ public class FastVectorHighlighter implements Highlighter { private static BoundaryScanner getBoundaryScanner(Field field) { final FieldOptions fieldOptions = field.fieldOptions(); - final Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale(); - switch(fieldOptions.boundaryScannerType()) { - case SENTENCE: - if (boundaryScannerLocale != null) { - return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale)); - } - return DEFAULT_SENTENCE_BOUNDARY_SCANNER; - case WORD: - if (boundaryScannerLocale != null) { - return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale)); - } - return DEFAULT_WORD_BOUNDARY_SCANNER; - default: - if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN + final Locale boundaryScannerLocale = + fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() : + Locale.ROOT; + final HighlightBuilder.BoundaryScannerType type = + fieldOptions.boundaryScannerType() != null ? fieldOptions.boundaryScannerType() : + HighlightBuilder.BoundaryScannerType.CHARS; + switch(type) { + case SENTENCE: + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale)); + } + return DEFAULT_SENTENCE_BOUNDARY_SCANNER; + case WORD: + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale)); + } + return DEFAULT_WORD_BOUNDARY_SCANNER; + case CHARS: + if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN || fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) { - return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars()); - } - return DEFAULT_SIMPLE_BOUNDARY_SCANNER; + return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars()); + } + return DEFAULT_SIMPLE_BOUNDARY_SCANNER; + default: + throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString()); } } diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java index 45b8c612a76..c7c9c547b51 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java @@ -95,7 +95,7 @@ public class HighlightBuilder extends AbstractHighlighterBuilder 0); + field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, + field.fieldOptions().noMatchSize()); numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value } else { //using paragraph separator we make sure that each field value holds a discrete passage for highlighting - String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR); + String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR); + BreakIterator bi = getBreakIterator(field); highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - mapperHighlighterEntry.passageFormatter, null, fieldValue, field.fieldOptions().noMatchSize() > 0); + mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi, + fieldValue, field.fieldOptions().noMatchSize()); numberOfFragments = field.fieldOptions().numberOfFragments(); } if (field.fieldOptions().requireFieldMatch()) { @@ -144,11 +151,34 @@ public class UnifiedHighlighter implements Highlighter { return null; } - static class HighlighterEntry { + private BreakIterator getBreakIterator(SearchContextHighlight.Field field) { + final SearchContextHighlight.FieldOptions fieldOptions = field.fieldOptions(); + final Locale locale = + fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() : + Locale.ROOT; + final HighlightBuilder.BoundaryScannerType type = + fieldOptions.boundaryScannerType() != null ? fieldOptions.boundaryScannerType() : + HighlightBuilder.BoundaryScannerType.SENTENCE; + int maxLen = fieldOptions.fragmentCharSize(); + switch (type) { + case SENTENCE: + if (maxLen > 0) { + return BoundedBreakIteratorScanner.getSentence(locale, maxLen); + } + return BreakIterator.getSentenceInstance(locale); + case WORD: + // ignore maxLen + return BreakIterator.getWordInstance(locale); + default: + throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString()); + } + } + + private static class HighlighterEntry { Map mappers = new HashMap<>(); } - static class MapperHighlighterEntry { + private static class MapperHighlighterEntry { final CustomPassageFormatter passageFormatter; private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) { diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java new file mode 100644 index 00000000000..0cf62e8ce6c --- /dev/null +++ b/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java @@ -0,0 +1,138 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.search.uhighlight; + +import org.elasticsearch.test.ESTestCase; + +import java.text.BreakIterator; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; + +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +public class BoundedBreakIteratorScannerTests extends ESTestCase { + private static final String[] WORD_BOUNDARIES = + new String[] { " ", " ", "\t", "#", "\n" }; + private static final String[] SENTENCE_BOUNDARIES = + new String[] { "! ", "? ", ". ", ".\n", ".\n\n" }; + + private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { + // Generate a random set of unique terms with ascii character + int maxSize = randomIntBetween(5, 100); + String[] vocabulary = new String[maxSize]; + for (int i = 0; i < maxSize; i++) { + if (rarely()) { + vocabulary[i] = randomAsciiOfLengthBetween(50, 200); + } else { + vocabulary[i] = randomAsciiOfLengthBetween(1, 30); + } + } + + // Generate a random text made of random terms separated with word-boundaries + // and sentence-boundaries. + StringBuilder text = new StringBuilder(); + List offsetList = new ArrayList<> (); + List sizeList = new ArrayList<> (); + // the number of sentences to generate + int numSentences = randomIntBetween(10, 100); + int maxTermLen = 0; + for (int i = 0; i < numSentences; i++) { + // the number of terms in the sentence + int numTerms = randomIntBetween(5, 10); + for (int j = 0; j < numTerms; j++) { + int termId = randomIntBetween(0, vocabulary.length - 1); + String term = vocabulary[termId].toLowerCase(Locale.ROOT); + if (j == 0) { + // capitalize the first letter of the first term in the sentence + term = term.substring(0, 1).toUpperCase(Locale.ROOT) + term.substring(1); + } else { + String sep = randomFrom(WORD_BOUNDARIES); + text.append(sep); + } + maxTermLen = Math.max(term.length(), maxTermLen); + offsetList.add(text.length()); + sizeList.add(term.length()); + text.append(term); + } + String boundary = randomFrom(SENTENCE_BOUNDARIES); + text.append(boundary); + } + + int[] sizes = sizeList.stream().mapToInt(i->i).toArray(); + int[] offsets = offsetList.stream().mapToInt(i->i).toArray(); + + bi.setText(text.toString()); + int currentPos = randomIntBetween(0, 20); + int lastEnd = -1; + int maxPassageLen = maxLen+(maxTermLen*2); + while (currentPos < offsets.length) { + // find the passage that contains the current term + int nextOffset = offsets[currentPos]; + int start = bi.preceding(nextOffset+1); + int end = bi.following(nextOffset); + + // check that the passage is valid + assertThat(start, greaterThanOrEqualTo(lastEnd)); + assertThat(end, greaterThan(start)); + assertThat(start, lessThanOrEqualTo(nextOffset)); + assertThat(end, greaterThanOrEqualTo(nextOffset)); + int passageLen = end-start; + assertThat(passageLen, lessThanOrEqualTo(maxPassageLen)); + + // checks that the start and end of the passage are on word boundaries. + int startPos = Arrays.binarySearch(offsets, start); + int endPos = Arrays.binarySearch(offsets, end); + if (startPos < 0) { + int lastWordEnd = + offsets[Math.abs(startPos)-2] + sizes[Math.abs(startPos)-2]; + assertThat(start, greaterThanOrEqualTo(lastWordEnd)); + } + if (endPos < 0) { + if (Math.abs(endPos)-2 < offsets.length) { + int lastWordEnd = + offsets[Math.abs(endPos) - 2] + sizes[Math.abs(endPos) - 2]; + assertThat(end, greaterThanOrEqualTo(lastWordEnd)); + } + // advance the position to the end of the current passage + currentPos = (Math.abs(endPos) - 1); + } else { + // advance the position to the end of the current passage + currentPos = endPos; + } + // randomly advance to the next term to highlight + currentPos += randomIntBetween(0, 20); + lastEnd = end; + } + } + + public void testBoundedSentence() { + for (int i = 0; i < 20; i++) { + int maxLen = randomIntBetween(10, 500); + testRandomAsciiTextCase( + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen), + maxLen + ); + } + } +} diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index 83b42750f92..23e867d2573 100644 --- a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -20,20 +20,22 @@ package org.apache.lucene.search.uhighlight; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; @@ -41,219 +43,167 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.highlight.DefaultEncoder; import org.apache.lucene.search.highlight.Snippet; import org.apache.lucene.store.Directory; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.lucene.all.AllTermQuery; import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; -import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils; import org.elasticsearch.test.ESTestCase; -import java.io.IOException; +import java.text.BreakIterator; +import java.util.Locale; +import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; import static org.hamcrest.CoreMatchers.equalTo; public class CustomUnifiedHighlighterTests extends ESTestCase { - public void testCustomUnifiedHighlighter() throws Exception { + private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer analyzer, Query query, + Locale locale, BreakIterator breakIterator, + int noMatchSize, String[] expectedPassages) throws Exception { Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); + IndexWriterConfig iwc = newIndexWriterConfig(analyzer); + iwc.setMergePolicy(newTieredMergePolicy(random())); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - offsetsType.setStoreTermVectorOffsets(true); - offsetsType.setStoreTermVectorPositions(true); - offsetsType.setStoreTermVectors(true); - - //good position but only one match - final String firstValue = "This is a test. Just a test1 highlighting from unified highlighter."; - Field body = new Field("body", "", offsetsType); + FieldType ft = new FieldType(TextField.TYPE_STORED); + ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + ft.freeze(); Document doc = new Document(); - doc.add(body); - body.setStringValue(firstValue); - - //two matches, not the best snippet due to its length though - final String secondValue = "This is the second highlighting value to perform highlighting on a longer text " + - "that gets scored lower."; - Field body2 = new Field("body", "", offsetsType); - doc.add(body2); - body2.setStringValue(secondValue); - - //two matches and short, will be scored highest - final String thirdValue = "This is highlighting the third short highlighting value."; - Field body3 = new Field("body", "", offsetsType); - doc.add(body3); - body3.setStringValue(thirdValue); - - //one match, same as first but at the end, will be scored lower due to its position - final String fourthValue = "Just a test4 highlighting from unified highlighter."; - Field body4 = new Field("body", "", offsetsType); - doc.add(body4); - body4.setStringValue(fourthValue); - + for (String input : inputs) { + Field field = new Field(fieldName, "", ft); + field.setStringValue(input); + doc.add(field); + } iw.addDocument(doc); - - IndexReader ir = iw.getReader(); + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); iw.close(); - - String firstHlValue = "Just a test1 highlighting from unified highlighter."; - String secondHlValue = "This is the second highlighting value to perform highlighting on a" + - " longer text that gets scored lower."; - String thirdHlValue = "This is highlighting the third short highlighting value."; - String fourthHlValue = "Just a test4 highlighting from unified highlighter."; - - IndexSearcher searcher = newSearcher(ir); - Query query = new TermQuery(new Term("body", "highlighting")); - - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER); assertThat(topDocs.totalHits, equalTo(1)); - - int docId = topDocs.scoreDocs[0].doc; - - String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue + - HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue; - - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, iwc.getAnalyzer(), - new CustomPassageFormatter("", "", new DefaultEncoder()), null, fieldValue, true); - Snippet[] snippets = highlighter.highlightField("body", query, docId, 5); - - assertThat(snippets.length, equalTo(4)); - - assertThat(snippets[0].getText(), equalTo(firstHlValue)); - assertThat(snippets[1].getText(), equalTo(secondHlValue)); - assertThat(snippets[2].getText(), equalTo(thirdHlValue)); - assertThat(snippets[3].getText(), equalTo(fourthHlValue)); - ir.close(); + String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR)); + CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, + new CustomPassageFormatter("", "", new DefaultEncoder()), locale, breakIterator, rawValue, + noMatchSize); + highlighter.setFieldMatcher((name) -> "text".equals(name)); + final Snippet[] snippets = + highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length); + assertEquals(snippets.length, expectedPassages.length); + for (int i = 0; i < snippets.length; i++) { + assertEquals(snippets[i].getText(), expectedPassages[i]); + } + reader.close(); dir.close(); } + public void testSimple() throws Exception { + final String[] inputs = { + "This is a test. Just a test1 highlighting from unified highlighter.", + "This is the second highlighting value to perform highlighting on a longer text that gets scored lower.", + "This is highlighting the third short highlighting value.", + "Just a test4 highlighting from unified highlighter." + }; + + String[] expectedPassages = { + "Just a test1 highlighting from unified highlighter.", + "This is the second highlighting value to perform highlighting on a" + + " longer text that gets scored lower.", + "This is highlighting the third short highlighting value.", + "Just a test4 highlighting from unified highlighter." + }; + Query query = new TermQuery(new Term("text", "highlighting")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, expectedPassages); + } + public void testNoMatchSize() throws Exception { - Directory dir = newDirectory(); - Analyzer analyzer = new StandardAnalyzer(); - IndexWriterConfig iwc = newIndexWriterConfig(analyzer); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - offsetsType.setStoreTermVectorOffsets(true); - offsetsType.setStoreTermVectorPositions(true); - offsetsType.setStoreTermVectors(true); - Field body = new Field("body", "", offsetsType); - Field none = new Field("none", "", offsetsType); - Document doc = new Document(); - doc.add(body); - doc.add(none); - - String firstValue = "This is a test. Just a test highlighting from unified. Feel free to ignore."; - body.setStringValue(firstValue); - none.setStringValue(firstValue); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - Query query = new TermQuery(new Term("none", "highlighting")); - - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter, - null, firstValue, false); - Snippet[] snippets = highlighter.highlightField("body", query, docId, 5); - assertThat(snippets.length, equalTo(0)); - - highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter, null, firstValue, true); - snippets = highlighter.highlightField("body", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("This is a test.")); - ir.close(); - dir.close(); - } - - - private IndexReader indexOneDoc(Directory dir, String field, String value, Analyzer analyzer) throws IOException { - IndexWriterConfig iwc = newIndexWriterConfig(analyzer); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field textField = new Field(field, "", ft); - Document doc = new Document(); - doc.add(textField); - - textField.setStringValue(value); - iw.addDocument(doc); - IndexReader ir = iw.getReader(); - iw.close(); - return ir; + final String[] inputs = { + "This is a test. Just a test highlighting from unified. Feel free to ignore." + }; + Query query = new TermQuery(new Term("body", "highlighting")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 100, inputs); } public void testMultiPhrasePrefixQuery() throws Exception { - Analyzer analyzer = new StandardAnalyzer(); - Directory dir = newDirectory(); - String value = "The quick brown fox."; - IndexReader ir = indexOneDoc(dir, "text", value, analyzer); + final String[] inputs = { + "The quick brown fox." + }; + final String[] outputs = { + "The quick brown fox." + }; MultiPhrasePrefixQuery query = new MultiPhrasePrefixQuery(); query.add(new Term("text", "quick")); query.add(new Term("text", "brown")); query.add(new Term("text", "fo")); - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - passageFormatter, null, value, false); - Snippet[] snippets = highlighter.highlightField("text", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("The quick brown fox.")); - ir.close(); - dir.close(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs); } - public void testAllTermQuery() throws IOException { - Directory dir = newDirectory(); - String value = "The quick brown fox."; - Analyzer analyzer = new StandardAnalyzer(); - IndexReader ir = indexOneDoc(dir, "all", value, analyzer); - AllTermQuery query = new AllTermQuery(new Term("all", "fox")); - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - passageFormatter, null, value, false); - Snippet[] snippets = highlighter.highlightField("all", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("The quick brown fox.")); - ir.close(); - dir.close(); + public void testAllTermQuery() throws Exception { + final String[] inputs = { + "The quick brown fox." + }; + final String[] outputs = { + "The quick brown fox." + }; + AllTermQuery query = new AllTermQuery(new Term("text", "fox")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs); } - public void testCommonTermsQuery() throws IOException { - Directory dir = newDirectory(); - String value = "The quick brown fox."; - Analyzer analyzer = new StandardAnalyzer(); - IndexReader ir = indexOneDoc(dir, "text", value, analyzer); + public void testCommonTermsQuery() throws Exception { + final String[] inputs = { + "The quick brown fox." + }; + final String[] outputs = { + "The quick brown fox." + }; CommonTermsQuery query = new CommonTermsQuery(BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, 128); query.add(new Term("text", "quick")); query.add(new Term("text", "brown")); query.add(new Term("text", "fox")); - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - passageFormatter, null, value, false); - Snippet[] snippets = highlighter.highlightField("text", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("The quick brown fox.")); - ir.close(); - dir.close(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs); + } + + public void testSentenceBoundedBreakIterator() throws Exception { + final String[] inputs = { + "The quick brown fox in a long sentence with another quick brown fox. " + + "Another sentence with brown fox." + }; + final String[] outputs = { + "The quick brown", + "fox in a long", + "with another quick", + "brown fox.", + "sentence with brown", + "fox.", + }; + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term("text", "quick")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "brown")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "fox")), BooleanClause.Occur.SHOULD) + .build(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs); + } + + public void testRepeat() throws Exception { + final String[] inputs = { + "Fun fun fun fun fun fun fun fun fun fun" + }; + final String[] outputs = { + "Fun fun fun", + "fun fun", + "fun fun fun", + "fun fun" + }; + Query query = new TermQuery(new Term("text", "fun")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs); + + query = new PhraseQuery.Builder() + .add(new Term("text", "fun")) + .add(new Term("text", "fun")) + .build(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs); } } diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index 7db99ff3232..11496309d47 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -751,52 +751,69 @@ public class HighlighterSearchIT extends ESIntegTestCase { assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over")); } - public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception { + public void testHighlighterWithSentenceBoundaryScanner() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); indexRandom(true, client().prepareIndex("test", "type1") .setSource("field1", "A sentence with few words. Another sentence with even more words.")); - logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); - SearchSourceBuilder source = searchSource() + for (String type : new String[] {"unified", "fvh"}) { + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); + SearchSourceBuilder source = searchSource() .query(termQuery("field1", "sentence")) .highlighter(highlight() - .field("field1", 20, 2) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.SENTENCE)); + .field("field1", 21, 2) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.SENTENCE)); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf( + equalTo("A sentence with few words"), + equalTo("A sentence with few words. ") + )); - assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); - assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); + assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf( + equalTo("Another sentence with"), + equalTo("Another sentence with even more words. ") + )); + } } - public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception { + public void testHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); indexRandom(true, client().prepareIndex("test", "type1") .setSource("field1", "A sentence with few words. Another sentence with even more words.")); - logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); - SearchSourceBuilder source = searchSource() + for (String type : new String[] {"fvh", "unified"}) { + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); + SearchSourceBuilder source = searchSource() .query(termQuery("field1", "sentence")) .highlighter(highlight() - .field("field1", 20, 2) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.SENTENCE) - .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + .field("field1", 21, 2) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.SENTENCE) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); - assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); + assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf( + equalTo("A sentence with few words"), + equalTo("A sentence with few words. ") + )); + + assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf( + equalTo("Another sentence with"), + equalTo("Another sentence with even more words. ") + )); + } } - public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception { + public void testHighlighterWithWordBoundaryScanner() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); @@ -804,39 +821,48 @@ public class HighlighterSearchIT extends ESIntegTestCase { .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); - SearchSourceBuilder source = searchSource() - .query(termQuery("field1", "some")) - .highlighter(highlight() - .field("field1", 23, 1) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.WORD)); + for (String type : new String[] {"unified", "fvh"}) { + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "some")) + .highlighter(highlight() + .field("field1", 23, 1) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD)); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); + assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf( + equalTo("some quick and hairy brown"), + equalTo("some") + )); + } } - public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception { + public void testHighlighterWithWordBoundaryScannerAndLocale() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); indexRandom(true, client().prepareIndex("test", "type1") .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); - logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); - SearchSourceBuilder source = searchSource() + for (String type : new String[] {"unified", "fvh"}) { + SearchSourceBuilder source = searchSource() .query(termQuery("field1", "some")) .highlighter(highlight() - .field("field1", 23, 1) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.WORD) - .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + .field("field1", 23, 1) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); + assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf( + equalTo("some quick and hairy brown"), + equalTo("some") + )); + } } /** @@ -1841,16 +1867,16 @@ public class HighlighterSearchIT extends ESIntegTestCase { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); + // Unified hl also works but the fragment is longer than the plain highlighter because of the boundary is the word + field.highlighterType("unified"); + response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); + // Postings hl also works but the fragment is the whole first sentence (size ignored) field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); - // Unified hl also works but the fragment is the whole first sentence (size ignored) - field.highlighterType("unified"); - response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); - // We can also ask for a fragment longer than the input string and get the whole string field.highlighterType("plain").noMatchSize(text.length() * 2); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); @@ -1860,16 +1886,15 @@ public class HighlighterSearchIT extends ESIntegTestCase { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo(text)); + field.highlighterType("unified"); + response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); + assertHighlight(response, 0, "text", 0, 1, equalTo(text)); + //no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0) field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); - //no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0) - field.highlighterType("unified"); - response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); - // We can also ask for a fragment exactly the size of the input field and get the whole field field.highlighterType("plain").noMatchSize(text.length()); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); @@ -1879,16 +1904,16 @@ public class HighlighterSearchIT extends ESIntegTestCase { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo(text)); + // unified hl returns the first sentence as the noMatchSize does not cross sentence boundary. + field.highlighterType("unified"); + response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); + assertHighlight(response, 0, "text", 0, 1, equalTo(text)); + //no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0) field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); - //no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0) - field.highlighterType("unified"); - response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); - // You can set noMatchSize globally in the highlighter as well field.highlighterType("plain").noMatchSize(null); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get(); @@ -1898,12 +1923,12 @@ public class HighlighterSearchIT extends ESIntegTestCase { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - field.highlighterType("postings"); - response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); - field.highlighterType("unified"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get(); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); + + field.highlighterType("postings"); + response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); // We don't break if noMatchSize is less than zero though @@ -1947,16 +1972,15 @@ public class HighlighterSearchIT extends ESIntegTestCase { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); + field.highlighterType("unified"); + response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); + // Postings hl also works but the fragment is the whole first sentence (size ignored) field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); - // Unified hl also works but the fragment is the whole first sentence (size ignored) - field.highlighterType("unified"); - response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); - // And noMatchSize returns nothing when the first entry is empty string! index("test", "type1", "2", "text", new String[] {"", text2}); refresh(); @@ -1980,11 +2004,12 @@ public class HighlighterSearchIT extends ESIntegTestCase { .highlighter(new HighlightBuilder().field(field)).get(); assertNotHighlighted(response, 0, "text"); + // except for the unified highlighter which starts from the first string with actual content field.highlighterType("unified"); response = client().prepareSearch("test") .setQuery(idsQueryBuilder) .highlighter(new HighlightBuilder().field(field)).get(); - assertNotHighlighted(response, 0, "text"); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am short")); // But if the field was actually empty then you should get no highlighting field index("test", "type1", "3", "text", new String[] {}); @@ -2031,7 +2056,7 @@ public class HighlighterSearchIT extends ESIntegTestCase { .highlighter(new HighlightBuilder().field(field)).get(); assertNotHighlighted(response, 0, "text"); - field.highlighterType("fvh"); + field.highlighterType("unified"); response = client().prepareSearch("test") .setQuery(idsQueryBuilder) .highlighter(new HighlightBuilder().field(field)).get(); @@ -2081,16 +2106,16 @@ public class HighlighterSearchIT extends ESIntegTestCase { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence")); + field.highlighterType("unified"); + response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); + assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence")); + + // Postings hl also works but the fragment is the whole first sentence (size ignored) field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence.")); - // Unified hl also works but the fragment is the whole first sentence (size ignored) - field.highlighterType("unified"); - response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence.")); - //if there's a match we only return the values with matches (whole value as number_of_fragments == 0) MatchQueryBuilder queryBuilder = QueryBuilders.matchQuery("text", "third fifth"); field.highlighterType("plain"); diff --git a/docs/reference/search/request/highlighting.asciidoc b/docs/reference/search/request/highlighting.asciidoc index 81f454bb158..033ba01cfc4 100644 --- a/docs/reference/search/request/highlighting.asciidoc +++ b/docs/reference/search/request/highlighting.asciidoc @@ -140,6 +140,9 @@ It supports accurate phrase and multi-term (fuzzy, prefix, regex) highlighting a * `highlight_query` * `pre_tags and `post_tags` * `require_field_match` +* `boundary_scanner` (`sentence` (**default**) or `word`) +* `max_fragment_length` (only for `sentence` scanner) +* `no_match_size` ==== Force highlighter type @@ -345,7 +348,7 @@ parameter to control the margin to start highlighting from. In the case where there is no matching fragment to highlight, the default is to not return anything. Instead, we can return a snippet of text from the beginning of the field by setting `no_match_size` (default `0`) to the length -of the text that you want returned. The actual length may be shorter than +of the text that you want returned. The actual length may be shorter or longer than specified as it tries to break on a word boundary. When using the postings highlighter it is not possible to control the actual size of the snippet, therefore the first sentence gets returned whenever `no_match_size` is @@ -504,21 +507,26 @@ GET /_search [[boundary-scanners]] ==== Boundary Scanners -When highlighting a field using the fast vector highlighter, you can specify -how to break the highlighted fragments using `boundary_scanner`, which accepts +When highlighting a field using the unified highlighter or the fast vector highlighter, +you can specify how to break the highlighted fragments using `boundary_scanner`, which accepts the following values: -* `chars` (default): allows to configure which characters (`boundary_chars`) +* `chars` (default mode for the FVH): allows to configure which characters (`boundary_chars`) constitute a boundary for highlighting. It's a single string with each boundary character defined in it (defaults to `.,!? \t\n`). It also allows configuring the `boundary_max_scan` to control how far to look for boundary characters -(defaults to `20`). +(defaults to `20`). Works only with the Fast Vector Highlighter. -* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator] -to break the highlighted fragments at the next _word_ or _sentence_ boundary. +* `sentence` and `word`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator] +to break the highlighted fragments at the next _sentence_ or _word_ boundary. You can further specify `boundary_scanner_locale` to control which Locale is used to search the text for these boundaries. +[NOTE] +When used with the `unified` highlighter, the `sentence` scanner splits sentence +bigger than `fragment_size` at the first word boundary next to `fragment_size`. +You can set `fragment_size` to 0 to never split any sentence. + [[matched-fields]] ==== Matched Fields The Fast Vector Highlighter can combine matches on multiple fields to