From e2f3f626ee4c7f2d2df1e09a31b971c81e95be44 Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Thu, 10 Sep 2020 13:17:13 +0200 Subject: [PATCH] LUCENE-9464: Add high(er)-level hit highlighter example that demonstrates and uses low-level components (#1820) --- .../FieldValueHighlighters.java | 139 ++++++ .../matchhighlight/MatchHighlighter.java | 308 ++++++++++++ .../matchhighlight/MatchRegionRetriever.java | 11 +- .../matchhighlight/AnalyzerWithGaps.java | 51 ++ .../search/matchhighlight/IndexBuilder.java | 105 ++++ .../matchhighlight/TestMatchHighlighter.java | 466 ++++++++++++++++++ .../TestMatchRegionRetriever.java | 311 +++++------- 7 files changed, 1199 insertions(+), 192 deletions(-) create mode 100644 lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java create mode 100644 lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java create mode 100644 lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/AnalyzerWithGaps.java create mode 100644 lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/IndexBuilder.java create mode 100644 lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java new file mode 100644 index 00000000000..ece66937e0e --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.matchhighlight; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.BiPredicate; +import java.util.function.Predicate; + +/** + * A factory of {@link org.apache.lucene.search.matchhighlight.MatchHighlighter.FieldValueHighlighter} classes + * that cover typical use cases (verbatim values, highlights, abbreviations). + * + * @see MatchHighlighter#appendFieldHighlighter + */ +public final class FieldValueHighlighters { + private FieldValueHighlighters() { + } + + private static abstract class AbstractFieldValueHighlighter implements MatchHighlighter.FieldValueHighlighter { + private final BiPredicate testPredicate; + + protected AbstractFieldValueHighlighter(BiPredicate testPredicate) { + this.testPredicate = testPredicate; + } + + @Override + public final boolean isApplicable(String field, boolean hasMatches) { + return testPredicate.test(field, hasMatches); + } + } + + /** + * Displays up to {@code maxLeadingCharacters} of the field's value, regardless of whether it contained + * highlights or not. + */ + public static MatchHighlighter.FieldValueHighlighter maxLeadingCharacters(int maxLeadingCharacters, String ellipsis, Set fields) { + PassageSelector passageSelector = defaultPassageSelector(); + PassageFormatter passageFormatter = new PassageFormatter(ellipsis, "", ""); + return new AbstractFieldValueHighlighter((field, hasMatches) -> fields.contains(field)) { + @Override + public List format(String field, String[] values, String contiguousValue, + List valueRanges, List matchOffsets) { + List bestPassages = + passageSelector.pickBest(contiguousValue, Collections.emptyList(), maxLeadingCharacters, 1, valueRanges); + + return passageFormatter.format(contiguousValue, bestPassages, valueRanges); + } + + @Override + public Collection alwaysFetchedFields() { + return fields; + } + }; + } + + /** + * Default preconfigured {@link PassageSelector}. + */ + public static PassageSelector defaultPassageSelector() { + return new PassageSelector( + PassageSelector.DEFAULT_SCORER, + new BreakIteratorShrinkingAdjuster()); + } + + /** + * Highlights fields matching predicate {@code matchFields} only if they contained query matches. + */ + public static MatchHighlighter.FieldValueHighlighter highlighted( + int maxPassageWindow, + int maxPassages, + PassageFormatter passageFormatter, + Predicate matchFields) { + PassageSelector passageSelector = defaultPassageSelector(); + return new AbstractFieldValueHighlighter((field, hasMatches) -> matchFields.test(field) && hasMatches) { + @Override + public List format(String field, String[] values, String contiguousValue, + List valueRanges, List matchOffsets) { + assert matchOffsets != null; + + List bestPassages = + passageSelector.pickBest(contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges); + + return passageFormatter.format(contiguousValue, bestPassages, valueRanges); + } + }; + } + + /** + * Always returns raw field values, no highlighting or value truncation is applied. + */ + public static MatchHighlighter.FieldValueHighlighter verbatimValue(String field, String... moreFields) { + HashSet matchFields = new HashSet<>(Arrays.asList(moreFields)); + matchFields.add(field); + return new AbstractFieldValueHighlighter((fld, hasMatches) -> matchFields.contains(fld)) { + @Override + public Collection alwaysFetchedFields() { + return matchFields; + } + + @Override + public List format(String field, String[] values, String contiguousValue, List valueRanges, + List matchOffsets) { + return Arrays.asList(values); + } + }; + } + + /** + * Matches all fields and omits their value in the output (so that no highlight or value is emitted). + */ + public static MatchHighlighter.FieldValueHighlighter skipRemaining() { + return new AbstractFieldValueHighlighter((field, hasMatches) -> true) { + @Override + public List format(String field, String[] values, String contiguousValue, List valueRanges, + List matchOffsets) { + return null; + } + }; + } +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java new file mode 100644 index 00000000000..20938b02ecb --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java @@ -0,0 +1,308 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.matchhighlight; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.DocumentStoredFieldVisitor; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.function.Predicate; +import java.util.stream.Stream; + +/** + * An example highlighter that combines several lower-level highlighting + * utilities in this package into a fully featured, ready-to-use component. + *

+ * Note that if you need to customize or tweak the details of highlighting, + * it is better to assemble your own highlighter using those low-level + * building blocks, rather than extend or modify this one. + */ +public class MatchHighlighter { + private final IndexSearcher searcher; + private final OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies; + private final Analyzer analyzer; + + private final HashSet fieldsAlwaysReturned = new HashSet<>(); + private final List fieldHighlighters = new ArrayList<>(); + + /** + * Actual per-field highlighter. Field highlighters are probed whether they + * are applicable to a particular combination of (field, hasMatches) pair. If a highlighter + * declares it is applicable, its {@link #format} method is invoked and the result + * is returned as the field's value. + * + * @see FieldValueHighlighters + */ + public interface FieldValueHighlighter { + /** + * Check if this highlighter can be applied to a given field. + * + * @param field Field name + * @param hasMatches {@code true} if the field has a non-empty set of match regions. + */ + boolean isApplicable(String field, boolean hasMatches); + + /** + * Do format field values appropriately. + */ + List format(String field, String[] values, String contiguousValue, + List valueRanges, List matchOffsets); + + /** + * @return Returns a set of fields that must be fetched for each document, regardless + * of whether they had matches or not. This is useful to load and return certain fields + * that should always be included (identifiers, document titles, etc.). + */ + default Collection alwaysFetchedFields() { + return Collections.emptyList(); + } + + /** + * Returns a new field value highlighter that is a combination of this one and another one. + */ + default FieldValueHighlighter or(FieldValueHighlighter other) { + FieldValueHighlighter first = this; + FieldValueHighlighter second = other; + + HashSet fieldUnion = new HashSet<>(); + fieldUnion.addAll(first.alwaysFetchedFields()); + fieldUnion.addAll(second.alwaysFetchedFields()); + + return new FieldValueHighlighter() { + @Override + public boolean isApplicable(String field, boolean hasMatches) { + return first.isApplicable(field, hasMatches) + || second.isApplicable(field, hasMatches); + } + + @Override + public List format(String field, String[] values, String contiguousValue, + List valueRanges, List matchOffsets) { + FieldValueHighlighter delegate = + first.isApplicable(field, matchOffsets != null && !matchOffsets.isEmpty()) ? first : second; + return delegate.format(field, values, contiguousValue, valueRanges, matchOffsets); + } + + @Override + public Collection alwaysFetchedFields() { + return fieldUnion; + } + }; + } + } + + /** + * Append a new highlighter to field highlighters chain. The order of field highlighters + * is important (first-matching wins). + */ + public MatchHighlighter appendFieldHighlighter(FieldValueHighlighter highlighter) { + fieldHighlighters.add(highlighter); + fieldsAlwaysReturned.addAll(highlighter.alwaysFetchedFields()); + return this; + } + + /** + * Always fetch the given set of fields for all input documents. + */ + public void alwaysFetchFields(String field, String... otherFields) { + Stream.concat(Stream.of(field), Stream.of(otherFields)) + .forEach(fld -> fieldsAlwaysReturned.add(Objects.requireNonNull(fld))); + } + + /** + * Single document's highlights. + */ + public static class DocHighlights { + public final int docId; + public final Map> fields = new LinkedHashMap<>(); + + public DocHighlights(int docId) { + this.docId = docId; + } + } + + /** + * An {@link OffsetRange} of a match, together with the source query that caused it. + */ + public static class QueryOffsetRange extends OffsetRange { + public final Query query; + + QueryOffsetRange(Query query, int from, int to) { + super(from, to); + this.query = query; + } + } + + private static class DocHit { + final int docId; + private final LeafReader leafReader; + private final int leafDocId; + private final LinkedHashMap> matchRanges + = new LinkedHashMap<>(); + + DocHit(int docId, LeafReader leafReader, int leafDocId) { + this.docId = docId; + this.leafReader = leafReader; + this.leafDocId = leafDocId; + } + + void addMatches(Query query, Map> hits) { + hits.forEach((field, offsets) -> { + List target = matchRanges.computeIfAbsent(field, (fld) -> new ArrayList<>()); + offsets.forEach(o -> target.add(new QueryOffsetRange(query, o.from, o.to))); + }); + } + + Document document(Predicate needsField) throws IOException { + // Only load the fields that have a chance to be highlighted. + DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor() { + @Override + public Status needsField(FieldInfo fieldInfo) { + return (matchRanges.containsKey(fieldInfo.name) || + needsField.test(fieldInfo.name)) ? Status.YES : Status.NO; + } + }; + + leafReader.document(leafDocId, visitor); + return visitor.getDocument(); + } + } + + public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) { + this(searcher, analyzer, MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer)); + } + + public MatchHighlighter(IndexSearcher searcher, + Analyzer analyzer, + OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) { + this.searcher = searcher; + this.offsetsRetrievalStrategies = offsetsRetrievalStrategies; + this.analyzer = analyzer; + } + + public Stream highlight(TopDocs topDocs, Query... queries) throws IOException { + // We want to preserve topDocs document ordering and MatchRegionRetriever is optimized + // for streaming, so we'll just prepopulate the map in proper order. + LinkedHashMap docHits = new LinkedHashMap<>(); + for (ScoreDoc scoreDoc : topDocs.scoreDocs) { + docHits.put(scoreDoc.doc, null); + } + + // Collect match ranges for each query and associate each range to the origin query. + for (Query q : queries) { + MatchRegionRetriever highlighter = + new MatchRegionRetriever(searcher, searcher.rewrite(q), offsetsRetrievalStrategies); + highlighter.highlightDocuments(topDocs, + (int docId, LeafReader leafReader, int leafDocId, Map> hits) -> { + DocHit docHit = docHits.get(docId); + if (docHit == null) { + docHit = new DocHit(docId, leafReader, leafDocId); + docHits.put(docId, docHit); + } + docHit.addMatches(q, hits); + }); + } + + return docHits.values().stream() + .filter(Objects::nonNull) // This should always the case? + .map(this::computeDocFieldValues); + } + + private DocHighlights computeDocFieldValues(DocHit docHit) { + Document doc; + try { + doc = docHit.document(fieldsAlwaysReturned::contains); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + + DocHighlights docHighlights = new DocHighlights(docHit.docId); + + HashSet unique = new HashSet<>(); + for (IndexableField indexableField : doc) { + String field = indexableField.name(); + if (!unique.add(field)) { + continue; + } + + String[] values = doc.getValues(field); + String contiguousValue = contiguousFieldValue(field, values); + List valueRanges = computeValueRanges(field, values); + List offsets = docHit.matchRanges.get(field); + + List formattedValues = fieldValueHighlighter(field, offsets != null) + .format(field, values, contiguousValue, valueRanges, offsets); + + if (formattedValues != null) { + docHighlights.fields.put(field, formattedValues); + } + } + + return docHighlights; + } + + private List computeValueRanges(String field, String[] values) { + ArrayList valueRanges = new ArrayList<>(); + int offset = 0; + for (CharSequence v : values) { + valueRanges.add(new OffsetRange(offset, offset + v.length())); + offset += v.length(); + offset += analyzer.getOffsetGap(field); + } + return valueRanges; + } + + private String contiguousFieldValue(String field, String[] values) { + String value; + if (values.length == 1) { + value = values[0]; + } else { + // TODO: This can be inefficient if offset gap is large but the logic + // of applying offsets would get much more complicated so leaving for now + // (would have to recalculate all offsets to omit gaps). + String fieldGapPadding = " ".repeat(analyzer.getOffsetGap(field)); + value = String.join(fieldGapPadding, values); + } + return value; + } + + private FieldValueHighlighter fieldValueHighlighter(String field, boolean hasMatches) { + for (FieldValueHighlighter highlighter : fieldHighlighters) { + if (highlighter.isApplicable(field, hasMatches)) { + return highlighter; + } + } + throw new RuntimeException("No field highlighter could be matched to field: " + field); + } +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java index 16c9a11e968..2861ac66191 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java @@ -80,22 +80,23 @@ public class MatchRegionRetriever { /** * A constructor with the default offset strategy supplier. + * + * @param analyzer An analyzer that may be used to reprocess (retokenize) document fields + * in the absence of position offsets in the index. Note that the analyzer must return + * tokens (positions and offsets) identical to the ones stored in the index. */ public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer) throws IOException { - this(searcher, query, analyzer, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer)); + this(searcher, query, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer)); } /** * @param searcher Index searcher to be used for retrieving matches. * @param query The query for which matches should be retrieved. The query should be rewritten * against the provided searcher. - * @param analyzer An analyzer that may be used to reprocess (retokenize) document fields - * in the absence of position offsets in the index. Note that the analyzer must return - * tokens (positions and offsets) identical to the ones stored in the index. * @param fieldOffsetStrategySupplier A custom supplier of per-field {@link OffsetsRetrievalStrategy} * instances. */ - public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer, + public MatchRegionRetriever(IndexSearcher searcher, Query query, OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier) throws IOException { leaves = searcher.getIndexReader().leaves(); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/AnalyzerWithGaps.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/AnalyzerWithGaps.java new file mode 100644 index 00000000000..3009c91bbe5 --- /dev/null +++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/AnalyzerWithGaps.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.matchhighlight; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.DelegatingAnalyzerWrapper; + +/** + * An analyzer for tests that has a predefined offset and position gap. + */ +class AnalyzerWithGaps extends DelegatingAnalyzerWrapper { + private final Analyzer delegate; + private final int offsetGap; + private final int positionGap; + + AnalyzerWithGaps(int offsetGap, int positionGap, Analyzer delegate) { + super(delegate.getReuseStrategy()); + this.delegate = delegate; + this.offsetGap = offsetGap; + this.positionGap = positionGap; + } + + @Override + protected Analyzer getWrappedAnalyzer(String fieldName) { + return delegate; + } + + @Override + public int getOffsetGap(String fieldName) { + return offsetGap; + } + + @Override + public int getPositionIncrementGap(String fieldName) { + return positionGap; + } +} \ No newline at end of file diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/IndexBuilder.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/IndexBuilder.java new file mode 100644 index 00000000000..2a6e783f80e --- /dev/null +++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/IndexBuilder.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.matchhighlight; + +import com.carrotsearch.randomizedtesting.RandomizedTest; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.IOUtils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.function.BiFunction; +import java.util.function.Consumer; + +/** + * Utility class for building an ephemeral document index + * and running a block of code on its reader. + */ +class IndexBuilder { + public static final String FLD_ID = "id"; + public static final String FLD_SORT_ORDER = "id_order"; + + private final BiFunction toField; + private final ArrayList documents = new ArrayList<>(); + private int seq; + + class DocFields { + final Document document; + + public DocFields(Document doc) { + this.document = doc; + } + + public void add(String field, String... values) { + assert values.length > 0 : "At least one field value is required."; + for (String value : values) { + document.add(toField.apply(field, value)); + } + } + } + + IndexBuilder(BiFunction valueToField) { + this.toField = valueToField; + } + + public IndexBuilder doc(String field, String... values) { + return doc(fields -> { + fields.add(field, values); + }); + } + + public IndexBuilder doc(Consumer fields) { + Document doc = new Document(); + doc.add(new NumericDocValuesField(FLD_SORT_ORDER, seq)); + doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES)); + fields.accept(new DocFields(doc)); + documents.add(doc); + return this; + } + + public IndexBuilder build(Analyzer analyzer, IOUtils.IOConsumer block) throws IOException { + IndexWriterConfig config = new IndexWriterConfig(analyzer); + config.setIndexSort(new Sort(new SortField(FLD_SORT_ORDER, SortField.Type.LONG))); + try (Directory directory = new ByteBuffersDirectory()) { + IndexWriter iw = new IndexWriter(directory, config); + for (Document doc : documents) { + iw.addDocument(doc); + } + if (RandomizedTest.randomBoolean()) { + iw.commit(); + } + iw.flush(); + + try (DirectoryReader reader = DirectoryReader.open(iw)) { + block.accept(reader); + } + } + return this; + } +} diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java new file mode 100644 index 00000000000..d1acf982541 --- /dev/null +++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java @@ -0,0 +1,466 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.matchhighlight; + +import com.carrotsearch.randomizedtesting.RandomizedTest; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; +import org.apache.lucene.analysis.synonym.SynonymGraphFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.intervals.IntervalQuery; +import org.apache.lucene.queries.intervals.Intervals; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.hamcrest.Matchers; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +public class TestMatchHighlighter extends LuceneTestCase { + private static final String FLD_ID = "id"; + private static final String FLD_TEXT1 = "text1"; + private static final String FLD_TEXT2 = "text2"; + + private FieldType TYPE_TEXT_POSITIONS_OFFSETS; + private FieldType TYPE_TEXT_POSITIONS; + + private PerFieldAnalyzerWrapper analyzer; + + @Before + public void setup() throws IOException { + TYPE_TEXT_POSITIONS = TextField.TYPE_STORED; + + TYPE_TEXT_POSITIONS_OFFSETS = new FieldType(TextField.TYPE_STORED); + TYPE_TEXT_POSITIONS_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + TYPE_TEXT_POSITIONS_OFFSETS.freeze(); + + Map fieldAnalyzers = new HashMap<>(); + + // Create an analyzer with some synonyms, just to showcase them. + SynonymMap synonymMap = buildSynonymMap(new String[][]{ + {"moon\u0000shine", "firewater"}, + {"firewater", "moon\u0000shine"}, + }); + + // Make a non-empty offset gap so that break iterator doesn't go haywire on multivalues + // glued together. + final int offsetGap = RandomizedTest.randomIntBetween(1, 2); + final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100}); + Analyzer synonymsAnalyzer = + new AnalyzerWithGaps(offsetGap, positionGap, new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new WhitespaceTokenizer(); + TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true); + return new TokenStreamComponents(tokenizer, tokenStream); + } + }); + + fieldAnalyzers.put(FLD_TEXT1, synonymsAnalyzer); + fieldAnalyzers.put(FLD_TEXT2, synonymsAnalyzer); + + analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers); + } + + static SynonymMap buildSynonymMap(String[][] synonyms) throws IOException { + SynonymMap.Builder builder = new SynonymMap.Builder(); + for (String[] pair : synonyms) { + assertThat(pair.length, Matchers.equalTo(2)); + builder.add(new CharsRef(pair[0]), new CharsRef(pair[1]), true); + } + return builder.build(); + } + + @Test + public void testBasicUsage() throws IOException { + new IndexBuilder(this::toField) + .doc(FLD_TEXT1, "foo bar baz") + .doc(FLD_TEXT1, "bar foo baz") + .doc(fields -> { + fields.add(FLD_TEXT1, "Very long content but not matching anything."); + fields.add(FLD_TEXT2, "no foo but bar"); + }) + .build(analyzer, reader -> { + Query query = new BooleanQuery.Builder() + .add(new TermQuery(new Term(FLD_TEXT1, "foo")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term(FLD_TEXT2, "bar")), BooleanClause.Occur.SHOULD) + .build(); + + // In the most basic scenario, we run a search against a query, retrieve + // top docs... + IndexSearcher searcher = new IndexSearcher(reader); + Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered. + TopDocs topDocs = searcher.search(query, 10, sortOrder); + + // ...and would want a fixed set of fields from those documents, some of them + // possibly highlighted if they matched the query. + // + // This configures the highlighter so that the FLD_ID field is always returned verbatim, + // and FLD_TEXT1 is returned *only if it contained a query match*. + MatchHighlighter highlighter = + new MatchHighlighter(searcher, analyzer) + .appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID)) + .appendFieldHighlighter(FieldValueHighlighters.highlighted( + 80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals)) + .appendFieldHighlighter(FieldValueHighlighters.skipRemaining()); + + // Note document field highlights are a stream over documents in topDocs. In the remaining code we will just + // collect them on the fly into a preformatted string. + Stream highlights = highlighter.highlight(topDocs, query); + assertHighlights(toDocList(highlights), + " 0. id: 0", + " text1: >foo< bar baz", + " 1. id: 1", + " text1: bar >foo< baz", + " 2. id: 2"); + + // In a more realistic use case, you'd want to show the value of a given field *regardless* of whether it + // contained a highlight or not -- it is odd that document "id: 2" above doesn't have the 'text1' field + // shown because that field wasn't part of the query match. + // + // Let's say the field is also potentially long; if it contains a match, + // we would want to display the contextual snippet surrounding that match. If it does not contain any + // matches, we would want to display its content up to a given number of characters (lead lines). + // + // Let's do this by adding an appropriate field highlighter on FLD_TEXT1. + highlighter = + new MatchHighlighter(searcher, analyzer) + .appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID)) + .appendFieldHighlighter(FieldValueHighlighters.highlighted( + 80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals)) + .appendFieldHighlighter(FieldValueHighlighters.maxLeadingCharacters(10, "...", Set.of(FLD_TEXT1))) + .appendFieldHighlighter(FieldValueHighlighters.skipRemaining()); + + assertHighlights(toDocList(highlighter.highlight(topDocs, query)), + " 0. id: 0", + " text1: >foo< bar baz", + " 1. id: 1", + " text1: bar >foo< baz", + " 2. id: 2", + " text1: Very long..."); + + // Field highlighters can apply to multiple fields and be chained for convenience. + // For example, this defines a combined highlighter over both FLD_TEXT1 and FLD_TEXT2. + Set fields = Set.of(FLD_TEXT1, FLD_TEXT2); + MatchHighlighter.FieldValueHighlighter highlightedOrAbbreviated = + FieldValueHighlighters.highlighted(80 * 3, 1, new PassageFormatter("...", ">", "<"), fields::contains) + .or(FieldValueHighlighters.maxLeadingCharacters(10, "...", fields)); + + highlighter = + new MatchHighlighter(searcher, analyzer) + .appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID)) + .appendFieldHighlighter(highlightedOrAbbreviated) + .appendFieldHighlighter(FieldValueHighlighters.skipRemaining()); + + assertHighlights(toDocList(highlighter.highlight(topDocs, query)), + " 0. id: 0", + " text1: >foo< bar baz", + " 1. id: 1", + " text1: bar >foo< baz", + " 2. id: 2", + " text1: Very long...", + " text2: no foo but >bar<"); + }); + } + + @Test + public void testSynonymHighlight() throws IOException { + // There is nothing special needed to highlight or process complex queries, synonyms, etc. + // Synonyms defined in the constructor of this class. + new IndexBuilder(this::toField) + .doc(FLD_TEXT1, "Where the moon shine falls, firewater flows.") + .build(analyzer, reader -> { + IndexSearcher searcher = new IndexSearcher(reader); + Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered. + + MatchHighlighter highlighter = + new MatchHighlighter(searcher, analyzer) + .appendFieldHighlighter(FieldValueHighlighters.highlighted( + 80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals)) + .appendFieldHighlighter(FieldValueHighlighters.skipRemaining()); + + Query query = new TermQuery(new Term(FLD_TEXT1, "firewater")); + assertHighlights(toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)), + "0. text1: Where the >moon shine< falls, >firewater< flows."); + + query = new PhraseQuery(FLD_TEXT1, "moon", "shine"); + assertHighlights(toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)), + "0. text1: Where the >moon shine< falls, >firewater< flows."); + }); + } + + @Test + public void testCustomFieldHighlightHandling() throws IOException { + // Match highlighter is a showcase of individual components in this package, suitable + // to create any kind of field-display designs. + // + // In this example we will build a custom field highlighting handler that + // highlights matches over a multivalued field, shows that field's values if it received + // no matches and limits the number of values displayed to at most 2 (with an appropriate message). + new IndexBuilder(this::toField) + // Just one document, one field, four values. + .doc(FLD_TEXT1, "foo bar", "bar foo baz", "bar baz foo", "baz baz baz") + .build(analyzer, reader -> { + IndexSearcher searcher = new IndexSearcher(reader); + Sort sortOrder = Sort.INDEXORDER; + + // Let's start with the simple predefined highlighter so that the field's value shows + // and is highlighted when it was part of the hit. + MatchHighlighter.FieldValueHighlighter highlighted = FieldValueHighlighters.highlighted( + 80 * 3, 2, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals); + MatchHighlighter highlighter = + new MatchHighlighter(searcher, analyzer) + .appendFieldHighlighter(highlighted) + .appendFieldHighlighter(FieldValueHighlighters.skipRemaining()); + + Query query = new TermQuery(new Term(FLD_TEXT1, "foo")); + TopDocs topDocs = searcher.search(query, 10, sortOrder); + + // Note the highlighter is configured with at most 2 snippets so the match on the + // third value ("bar baz foo") is omitted. Ellipsis isn't inserted too because + // values are displayed in full. + assertHighlights(toDocList(highlighter.highlight(topDocs, query)), + "0. text1: >foo< bar, bar >foo< baz"); + + // So the above works fine if the field received a match but omits it otherwise. We can + // force the display of this field by chaining with verbatim value highlighter: + highlighter = + new MatchHighlighter(searcher, analyzer) + .appendFieldHighlighter(highlighted.or(FieldValueHighlighters.verbatimValue(FLD_TEXT1))) + .appendFieldHighlighter(FieldValueHighlighters.skipRemaining()); + + assertHighlights(toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())), + "0. text1: foo bar, bar foo baz, bar baz foo, baz baz baz"); + + // But this is not exactly what we'd like because we want to limit the display of values to the first two. + // Let's just write a custom field highlighter handler that does it. + class AtMostNValuesHighlighter implements MatchHighlighter.FieldValueHighlighter { + private final String field; + private final int limit; + + AtMostNValuesHighlighter(String field, int limit) { + this.field = field; + this.limit = limit; + } + + @Override + public boolean isApplicable(String field, boolean hasMatches) { + return Objects.equals(field, this.field); + } + + @Override + public List format(String field, String[] values, String contiguousValue, + List valueRanges, List matchOffsets) { + if (values.length <= limit) { + return Arrays.asList(values); + } else { + List collected = Stream.of(values).limit(limit).collect(Collectors.toList()); + int remaining = values.length - collected.size(); + collected.add(String.format(Locale.ROOT, "[%d omitted]", remaining)); + return collected; + } + } + + @Override + public Collection alwaysFetchedFields() { + return Collections.singleton(field); + } + } + + // We can now chain it as usual and contemplate the result. + highlighter = + new MatchHighlighter(searcher, analyzer) + .appendFieldHighlighter(highlighted.or(new AtMostNValuesHighlighter(FLD_TEXT1, 2))) + .appendFieldHighlighter(FieldValueHighlighters.skipRemaining()); + + assertHighlights(toDocList(highlighter.highlight(topDocs, query)), + "0. text1: >foo< bar, bar >foo< baz"); + assertHighlights(toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())), + "0. text1: foo bar, bar foo baz, [2 omitted]"); + }); + } + + @Test + public void testHighlightMoreQueriesAtOnceShowoff() throws IOException { + // Match highlighter underlying components are powerful enough to build interesting, + // if not always super-practical, things. In this case, we would like to highlight + // a set of matches of *more than one* query over the same set of input documents. This includes + // highest-scoring passage resolution (from multiple hits) and different highlight markers + // for each query. + new IndexBuilder(this::toField) + .doc(FLD_TEXT1, "foo bar baz") + .doc(FLD_TEXT1, "foo baz bar") + .build(analyzer, reader -> { + // Let's start with the two queries. The first one will be an unordered + // query for (foo, baz) with a max gap of 1; let's use intervals for this. + Query q1 = new IntervalQuery(FLD_TEXT1, + Intervals.maxgaps(1, + Intervals.unordered( + Intervals.term("foo"), + Intervals.term("baz")))); + + // The second one will be a simpler term query for "bar". + Query q2 = new TermQuery(new Term(FLD_TEXT1, "bar")); + + // Let's fetch matching documents by combining the two into a Boolean query. + Query query = new BooleanQuery.Builder() + .add(q1, BooleanClause.Occur.SHOULD) + .add(q2, BooleanClause.Occur.SHOULD) + .build(); + + IndexSearcher searcher = new IndexSearcher(reader); + Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered. + TopDocs topDocs = searcher.search(query, 10, sortOrder); + + // If we use the "regular" highlighter, the result will be slightly odd: a nested + // highlight over "bar" within the first match. Also, you can't distinguish which of the sub-queries + // caused which highlight marker... but if it were HTML then you could give the span + // some semi-translucent background and layered matches would be visible. + MatchHighlighter highlighter = + new MatchHighlighter(searcher, analyzer) + .appendFieldHighlighter(FieldValueHighlighters.highlighted( + 80 * 3, 1, new PassageFormatter("...", "", ""), FLD_TEXT1::equals)) + .appendFieldHighlighter(FieldValueHighlighters.skipRemaining()); + + assertHighlights(toDocList(highlighter.highlight(topDocs, query)), + "0. text1: foo bar baz", + "1. text1: foo baz bar"); + + // To separate highlights for multiple queries we'll pass them separately to the + // highlighter and differentiate highlight markers upon their application. Let's start with the customized + // field highlighter first. This utilizes the fact that match ranges passed from MatchHighlighter + // contain a reference to the original query which brought up the match. + class SeparateMarkerFieldHighlighter implements MatchHighlighter.FieldValueHighlighter { + private final String field; + private final Map queryClassMap; + + SeparateMarkerFieldHighlighter(String field, Map queryClassMap) { + this.field = field; + this.queryClassMap = queryClassMap; + } + + @Override + public boolean isApplicable(String field, boolean hasMatches) { + return Objects.equals(field, this.field) && hasMatches; + } + + @Override + public List format(String field, String[] values, String contiguousValue, + List valueRanges, List matchOffsets) { + PassageSelector passageSelector = new PassageSelector(); + int maxPassageWindow = 80; + int maxPassages = 3; + List bestPassages = + passageSelector.pickBest(contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges); + + // We know the offset ranges passed to us by MatchHighlighter are instances of QueryOffsetRange + // so we compute the class based on that. + Function queryToClass = + (range) -> queryClassMap.get(((MatchHighlighter.QueryOffsetRange) range).query); + + PassageFormatter passageFormatter = new PassageFormatter("...", + (range) -> "", + (range) -> ""); + + return passageFormatter.format(contiguousValue, bestPassages, valueRanges); + } + } + + // And this is pretty much it. We now set up query classes to display, set up the highlighter... + Map queryClassMap = Map.of(q1, "q1", q2, "q2"); + highlighter = + new MatchHighlighter(searcher, analyzer) + .appendFieldHighlighter(new SeparateMarkerFieldHighlighter(FLD_TEXT1, queryClassMap)) + .appendFieldHighlighter(FieldValueHighlighters.skipRemaining()); + + // ...and run highlighting. Note the query passed to the highlighter are individual sub-clauses + // of the Boolean query used to fetch documents. + assertHighlights(toDocList(highlighter.highlight(topDocs, q1, q2)), + "0. text1: foo bar baz", + "1. text1: foo baz bar"); + }); + } + + private void assertHighlights(List> docList, String... expectedFormattedLines) { + ArrayList actualLines = new ArrayList<>(); + for (int doc = 0; doc < docList.size(); doc++) { + List fields = docList.get(doc); + for (int i = 0; i < fields.size(); i++) { + actualLines.add((i == 0 ? String.format(Locale.ROOT, "%2d. ", doc) : " ") + fields.get(i)); + } + } + + if (!Arrays.equals( + Stream.of(expectedFormattedLines).map(String::trim).toArray(), + actualLines.stream().map(String::trim).toArray())) { + throw new AssertionError("Actual hits were:\n" + + String.join("\n", actualLines) + "\n\n but expected them to be:\n" + + String.join("\n", expectedFormattedLines)); + } + } + + private List> toDocList(Stream highlights) { + return highlights.map(docHighlights -> + docHighlights.fields.entrySet().stream() + .map(e -> e.getKey() + ": " + String.join(", ", e.getValue())) + .collect(Collectors.toList()) + ).collect(Collectors.toList()); + } + + private IndexableField toField(String name, String value) { + switch (name) { + case FLD_TEXT1: + return new Field(name, value, TYPE_TEXT_POSITIONS_OFFSETS); + case FLD_TEXT2: + return new Field(name, value, TYPE_TEXT_POSITIONS); + default: + throw new AssertionError("Don't know how to handle this field: " + name); + } + } +} diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java index 0fd9ca03b7b..691877c1199 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java @@ -20,21 +20,17 @@ import com.carrotsearch.randomizedtesting.RandomizedTest; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.synonym.SynonymGraphFilter; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; -import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.queries.intervals.IntervalQuery; @@ -52,19 +48,13 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanTermQuery; -import org.apache.lucene.store.ByteBuffersDirectory; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.CharsRef; -import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; import org.hamcrest.Matchers; import org.junit.Before; import org.junit.Test; import java.io.IOException; -import java.io.UncheckedIOException; import java.util.ArrayList; -import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Locale; @@ -75,11 +65,9 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import static org.hamcrest.Matchers.containsInAnyOrder; -import static org.hamcrest.Matchers.emptyArray; -import static org.hamcrest.Matchers.not; public class TestMatchRegionRetriever extends LuceneTestCase { - private static final String FLD_ID = "field_id"; + private static final String FLD_ID = IndexBuilder.FLD_ID; private static final String FLD_TEXT_POS_OFFS1 = "field_text_offs1"; private static final String FLD_TEXT_POS_OFFS2 = "field_text_offs2"; @@ -100,7 +88,7 @@ public class TestMatchRegionRetriever extends LuceneTestCase { private Analyzer analyzer; @Before - public void setup() { + public void setup() throws IOException { TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED); TYPE_STORED_WITH_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); TYPE_STORED_WITH_OFFSETS.freeze(); @@ -109,26 +97,24 @@ public class TestMatchRegionRetriever extends LuceneTestCase { TYPE_STORED_NO_POSITIONS.setIndexOptions(IndexOptions.DOCS_AND_FREQS); TYPE_STORED_NO_POSITIONS.freeze(); + final int offsetGap = RandomizedTest.randomIntBetween(0, 2); + final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100}); Analyzer whitespaceAnalyzer = - new Analyzer() { - final int offsetGap = RandomizedTest.randomIntBetween(0, 2); - final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100}); + new AnalyzerWithGaps(offsetGap, positionGap, + new WhitespaceAnalyzer(WhitespaceTokenizer.DEFAULT_MAX_WORD_LEN)); + SynonymMap synonymMap = TestMatchHighlighter.buildSynonymMap(new String[][] { + {"foo\u0000bar", "syn1"}, + {"baz", "syn2\u0000syn3"}, + }); + + Analyzer synonymsAnalyzer = + new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { - WhitespaceTokenizer tokenizer = - new WhitespaceTokenizer(CharTokenizer.DEFAULT_MAX_WORD_LEN); - return new TokenStreamComponents(tokenizer); - } - - @Override - public int getOffsetGap(String fieldName) { - return offsetGap; - } - - @Override - public int getPositionIncrementGap(String fieldName) { - return positionGap; + Tokenizer tokenizer = new WhitespaceTokenizer(); + TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true); + return new TokenStreamComponents(tokenizer, tokenStream); } }; @@ -138,26 +124,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase { fieldAnalyzers.put(FLD_TEXT_POS_OFFS1, whitespaceAnalyzer); fieldAnalyzers.put(FLD_TEXT_POS_OFFS2, whitespaceAnalyzer); fieldAnalyzers.put(FLD_TEXT_NOPOS, whitespaceAnalyzer); - - try { - SynonymMap.Builder b = new SynonymMap.Builder(); - b.add(new CharsRef("foo\u0000bar"), new CharsRef("syn1"), true); - b.add(new CharsRef("baz"), new CharsRef("syn2\u0000syn3"), true); - SynonymMap synonymMap = b.build(); - Analyzer synonymsAnalyzer = - new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new WhitespaceTokenizer(); - TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true); - return new TokenStreamComponents(tokenizer, tokenStream); - } - }; - fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer); - fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer); - } catch (IOException e) { - throw new UncheckedIOException(e); - } + fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer); + fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer); analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers); } @@ -184,13 +152,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase { } private void checkTermQuery(String field) throws IOException { - withReader( - List.of( - Map.of(field, values("foo bar baz")), - Map.of(field, values("bar foo baz")), - Map.of(field, values("bar baz foo")), - Map.of(field, values("bar bar bar irrelevant"))), - reader -> { + new IndexBuilder(this::toField) + .doc(field, "foo bar baz") + .doc(field, "bar foo baz") + .doc(field, "bar baz foo") + .doc(field, "bar bar bar irrelevant") + .build(analyzer, reader -> { assertThat(highlights(reader, new TermQuery(new Term(field, "foo"))), containsInAnyOrder( fmt("0: (%s: '>foo< bar baz')", field), @@ -217,17 +184,17 @@ public class TestMatchRegionRetriever extends LuceneTestCase { .add(new TermQuery(new Term(field, "xyz")), BooleanClause.Occur.MUST_NOT) .build(); - withReader( - List.of( - Map.of(field, values("foo bar baz abc")), - Map.of(field, values("bar foo baz def")), - Map.of(field, values("bar baz foo xyz"))), - reader -> { + new IndexBuilder(this::toField) + .doc(field, "foo bar baz abc") + .doc(field, "bar foo baz def") + .doc(field, "bar baz foo xyz") + .build(analyzer, reader -> { assertThat(highlights(reader, query), containsInAnyOrder( fmt("0: (%s: '>foo bar baz< abc')", field), fmt("1: (%s: 'bar >foo baz< def')", field))); - }); + } + ); } @Test @@ -241,12 +208,11 @@ public class TestMatchRegionRetriever extends LuceneTestCase { } private void checkVariousQueryTypes(String field) throws IOException { - withReader( - List.of( - Map.of(field, values("foo bar baz abc")), - Map.of(field, values("bar foo baz def")), - Map.of(field, values("bar baz foo xyz"))), - reader -> { + new IndexBuilder(this::toField) + .doc(field, "foo bar baz abc") + .doc(field, "bar foo baz def") + .doc(field, "bar baz foo xyz") + .build(analyzer, reader -> { assertThat(highlights(reader, stdQueryParser.apply("foo baz", field)), containsInAnyOrder( fmt("0: (%s: '>foo< bar >baz< abc')", field), @@ -297,31 +263,31 @@ public class TestMatchRegionRetriever extends LuceneTestCase { assertThat(highlights(reader, new MatchAllDocsQuery()), Matchers.hasSize(0)); - }); + } + ); - withReader( - List.of( - Map.of(field, values("foo baz foo")), - Map.of(field, values("bas baz foo")), - Map.of(field, values("bar baz foo xyz"))), - reader -> { + new IndexBuilder(this::toField) + .doc(field, "foo baz foo") + .doc(field, "bas baz foo") + .doc(field, "bar baz foo xyz") + .build(analyzer, reader -> { assertThat( highlights(reader, stdQueryParser.apply("[bar TO baz] -bar", field)), containsInAnyOrder( fmt("0: (%s: 'foo >baz< foo')", field), fmt("1: (%s: '>bas< >baz< foo')", field))); - }); + } + ); } @Test public void testIntervalQueries() throws IOException { String field = FLD_TEXT_POS_OFFS; - withReader( - List.of( - Map.of(field, values("foo baz foo")), - Map.of(field, values("bas baz foo")), - Map.of(field, values("bar baz foo xyz"))), - reader -> { + new IndexBuilder(this::toField) + .doc(field, "foo baz foo") + .doc(field, "bas baz foo") + .doc(field, "bar baz foo xyz") + .build(analyzer, reader -> { assertThat( highlights(reader, new IntervalQuery(field, Intervals.unordered( @@ -374,7 +340,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase { containsInAnyOrder( fmt("2: (field_text_offs: '>bar baz foo< xyz')", field) )); - }); + } + ); } @Test @@ -388,36 +355,37 @@ public class TestMatchRegionRetriever extends LuceneTestCase { } public void checkMultivaluedFields(String field) throws IOException { - withReader( - List.of( - Map.of(field, values("foo bar", "baz abc", "bad baz")), - Map.of(field, values("bar foo", "baz def")), - Map.of(field, values("bar baz", "foo xyz"))), - reader -> { + new IndexBuilder(this::toField) + .doc(field, "foo bar", "baz abc", "bad baz") + .doc(field, "bar foo", "baz def") + .doc(field, "bar baz", "foo xyz") + .build(analyzer, reader -> { assertThat(highlights(reader, stdQueryParser.apply("baz", field)), containsInAnyOrder( fmt("0: (%s: '>baz< abc | bad >baz<')", field), fmt("1: (%s: '>baz< def')", field), fmt("2: (%s: 'bar >baz<')", field))); - }); + } + ); } @Test public void testMultiFieldHighlights() throws IOException { - for (String[] fields : + for (String[] fieldPairs : new String[][]{ {FLD_TEXT_POS_OFFS1, FLD_TEXT_POS_OFFS2}, {FLD_TEXT_POS, FLD_TEXT_POS_OFFS2}, {FLD_TEXT_POS_OFFS1, FLD_TEXT_POS} }) { - String field1 = fields[0]; - String field2 = fields[1]; - withReader( - List.of( - Map.of( - field1, values("foo bar", "baz abc"), - field2, values("foo baz", "loo bar"))), - reader -> { + String field1 = fieldPairs[0]; + String field2 = fieldPairs[1]; + + new IndexBuilder(this::toField) + .doc(fields -> { + fields.add(field1, "foo bar", "baz abc"); + fields.add(field2, "foo baz", "loo bar"); + }) + .build(analyzer, reader -> { String ordered = Stream.of(fmt("(%s: '>baz< abc')", field1), fmt("(%s: 'loo >bar<')", field2)) .sorted() @@ -428,7 +396,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase { reader, stdQueryParser.apply(field1 + ":baz" + " OR " + field2 + ":bar", field1)), containsInAnyOrder(fmt("0: %s", ordered))); - }); + } + ); } } @@ -440,15 +409,17 @@ public class TestMatchRegionRetriever extends LuceneTestCase { public void testNoRewrite() throws IOException { String field1 = FLD_TEXT_POS_OFFS1; String field2 = FLD_TEXT_POS_OFFS2; - withReader( - List.of( - Map.of( - field1, values("0100"), - field2, values("loo bar")), - Map.of( - field1, values("0200"), - field2, values("foo bar"))), - reader -> { + + new IndexBuilder(this::toField) + .doc(fields -> { + fields.add(field1, "0100"); + fields.add(field2, "loo bar"); + }) + .doc(fields -> { + fields.add(field1, "0200"); + fields.add(field2, "foo bar"); + }) + .build(analyzer, reader -> { String expected = fmt("0: (%s: '>0100<')(%s: 'loo >bar<')", field1, field2); assertThat( highlights( @@ -461,7 +432,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase { reader, stdQueryParser.apply(fmt("+%s:01* AND %s:bar", field1, field2), field1)), containsInAnyOrder(expected)); - }); + } + ); } @Test @@ -475,9 +447,9 @@ public class TestMatchRegionRetriever extends LuceneTestCase { } public void checkNestedQueryHits(String field) throws IOException { - withReader( - List.of(Map.of(field, values("foo bar baz abc"))), - reader -> { + new IndexBuilder(this::toField) + .doc(field, "foo bar baz abc") + .build(analyzer, reader -> { assertThat( highlights( reader, @@ -496,7 +468,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase { .add(new TermQuery(new Term(field, "baz")), BooleanClause.Occur.SHOULD) .build()), containsInAnyOrder(fmt("0: (%s: '>foo >bar< >baz<< abc')", field))); - }); + } + ); } @Test @@ -510,13 +483,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase { } private void checkGraphQuery(String field) throws IOException { - withReader( - List.of( - Map.of(field, values("foo bar baz")), - Map.of(field, values("bar foo baz")), - Map.of(field, values("bar baz foo")), - Map.of(field, values("bar bar bar irrelevant"))), - reader -> { + new IndexBuilder(this::toField) + .doc(field, "foo bar baz") + .doc(field, "bar foo baz") + .doc(field, "bar baz foo") + .doc(field, "bar bar bar irrelevant") + .build(analyzer, reader -> { assertThat(highlights(reader, new TermQuery(new Term(field, "syn1"))), containsInAnyOrder(fmt("0: (%s: '>foo bar< baz')", field))); @@ -536,7 +508,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase { assertThat( highlights(reader, stdQueryParser.apply(field + ":\"foo syn2 syn3\"", field)), containsInAnyOrder(fmt("1: (%s: 'bar >foo baz<')", field))); - }); + } + ); } @Test @@ -550,13 +523,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase { } private void checkSpanQueries(String field) throws IOException { - withReader( - List.of( - Map.of(field, values("foo bar baz")), - Map.of(field, values("bar foo baz")), - Map.of(field, values("bar baz foo")), - Map.of(field, values("bar bar bar irrelevant"))), - reader -> { + new IndexBuilder(this::toField) + .doc(field, "foo bar baz") + .doc(field, "bar foo baz") + .doc(field, "bar baz foo") + .doc(field, "bar bar bar irrelevant") + .build(analyzer, reader -> { assertThat( highlights( reader, @@ -598,7 +570,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase { fmt("0: (%s: '>foo bar< baz')", field), fmt("1: (%s: '>bar foo< baz')", field), fmt("2: (%s: '>bar baz foo<')", field))); - }); + } + ); } /** @@ -610,12 +583,10 @@ public class TestMatchRegionRetriever extends LuceneTestCase { public void testTextFieldNoPositionsOffsetFromValues() throws Exception { String field = FLD_TEXT_NOPOS; - withReader( - List.of( - Map.of(FLD_TEXT_NOPOS, values("foo bar")), - Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz baz")) - ), - reader -> { + new IndexBuilder(this::toField) + .doc(FLD_TEXT_NOPOS, "foo bar") + .doc(FLD_TEXT_NOPOS, "foo bar", "baz baz") + .build(analyzer, reader -> { OffsetsRetrievalStrategySupplier defaults = MatchRegionRetriever .computeOffsetRetrievalStrategies(reader, analyzer); OffsetsRetrievalStrategySupplier customSuppliers = (fld) -> { @@ -634,7 +605,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase { containsInAnyOrder( fmt("0: (%s: '>foo bar<')", field), fmt("1: (%s: '>foo bar< | >baz baz<')", field))); - }); + } + ); } /** @@ -648,13 +620,13 @@ public class TestMatchRegionRetriever extends LuceneTestCase { public void testTextFieldNoPositionsOffsetsFromTokens() throws Exception { String field = FLD_TEXT_NOPOS; - withReader( - List.of( - Map.of(FLD_TEXT_NOPOS, values("foo bar"), - FLD_TEXT_POS, values("bar bar")), - Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz bar")) - ), - reader -> { + new IndexBuilder(this::toField) + .doc(fields -> { + fields.add(FLD_TEXT_NOPOS, "foo bar"); + fields.add(FLD_TEXT_POS, "bar bar"); + }) + .doc(FLD_TEXT_NOPOS, "foo bar", "baz bar") + .build(analyzer, reader -> { assertThat( highlights( reader, @@ -662,7 +634,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase { containsInAnyOrder( fmt("0: (%s: 'foo >bar<')", field), fmt("1: (%s: 'foo >bar< | baz >bar<')", field))); - }); + } + ); } private List highlights(IndexReader reader, Query query) throws IOException { @@ -702,46 +675,14 @@ public class TestMatchRegionRetriever extends LuceneTestCase { } }; - MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, analyzer, - offsetsStrategySupplier); + MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, offsetsStrategySupplier); highlighter.highlightDocuments(topDocs, highlightCollector); return highlights; } - private String[] values(String... values) { - assertThat(values, not(emptyArray())); - return values; - } - - private void withReader( - Collection> docs, IOUtils.IOConsumer block) - throws IOException { - IndexWriterConfig config = new IndexWriterConfig(analyzer); - - try (Directory directory = new ByteBuffersDirectory()) { - IndexWriter iw = new IndexWriter(directory, config); - - int seq = 0; - for (Map fields : docs) { - Document doc = new Document(); - doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES)); - for (Map.Entry field : fields.entrySet()) { - for (String value : field.getValue()) { - doc.add(toField(field.getKey(), value)); - } - } - iw.addDocument(doc); - if (RandomizedTest.randomBoolean()) { - iw.commit(); - } - } - iw.flush(); - - try (DirectoryReader reader = DirectoryReader.open(iw)) { - block.accept(reader); - } - } + private static String fmt(String string, Object... args) { + return String.format(Locale.ROOT, string, args); } private IndexableField toField(String name, String value) { @@ -760,8 +701,4 @@ public class TestMatchRegionRetriever extends LuceneTestCase { throw new AssertionError("Don't know how to handle this field: " + name); } } - - private static String fmt(String string, Object... args) { - return String.format(Locale.ROOT, string, args); - } }