LUCENE-9464: Add high(er)-level hit highlighter example that demonstrates and uses low-level components (#1820)

Dawid Weiss 2020-09-10 13:17:13 +02:00 committed by GitHub
parent 8debc9d0c2
commit e2f3f626ee
7 changed files with 1199 additions and 192 deletions

View File

@ -0,0 +1,139 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.BiPredicate;
import java.util.function.Predicate;
/**
* A factory of {@link org.apache.lucene.search.matchhighlight.MatchHighlighter.FieldValueHighlighter} classes
* that cover typical use cases (verbatim values, highlights, abbreviations).
*
* @see MatchHighlighter#appendFieldHighlighter
*/
public final class FieldValueHighlighters {
private FieldValueHighlighters() {
}
private static abstract class AbstractFieldValueHighlighter implements MatchHighlighter.FieldValueHighlighter {
private final BiPredicate<String, Boolean> testPredicate;
protected AbstractFieldValueHighlighter(BiPredicate<String, Boolean> testPredicate) {
this.testPredicate = testPredicate;
}
@Override
public final boolean isApplicable(String field, boolean hasMatches) {
return testPredicate.test(field, hasMatches);
}
}
/**
* Displays up to {@code maxLeadingCharacters} of the field's value, regardless of whether it contained
* highlights or not.
*/
public static MatchHighlighter.FieldValueHighlighter maxLeadingCharacters(int maxLeadingCharacters, String ellipsis, Set<String> fields) {
PassageSelector passageSelector = defaultPassageSelector();
PassageFormatter passageFormatter = new PassageFormatter(ellipsis, "", "");
return new AbstractFieldValueHighlighter((field, hasMatches) -> fields.contains(field)) {
@Override
public List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
List<Passage> bestPassages =
passageSelector.pickBest(contiguousValue, Collections.emptyList(), maxLeadingCharacters, 1, valueRanges);
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
}
@Override
public Collection<String> alwaysFetchedFields() {
return fields;
}
};
}
/**
* Default preconfigured {@link PassageSelector}.
*/
public static PassageSelector defaultPassageSelector() {
return new PassageSelector(
PassageSelector.DEFAULT_SCORER,
new BreakIteratorShrinkingAdjuster());
}
/**
* Highlights fields matching predicate {@code matchFields} only if they contained query matches.
*/
public static MatchHighlighter.FieldValueHighlighter highlighted(
int maxPassageWindow,
int maxPassages,
PassageFormatter passageFormatter,
Predicate<String> matchFields) {
PassageSelector passageSelector = defaultPassageSelector();
return new AbstractFieldValueHighlighter((field, hasMatches) -> matchFields.test(field) && hasMatches) {
@Override
public List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
assert matchOffsets != null;
List<Passage> bestPassages =
passageSelector.pickBest(contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges);
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
}
};
}
/**
* Always returns raw field values; no highlighting or value truncation is applied.
*/
public static MatchHighlighter.FieldValueHighlighter verbatimValue(String field, String... moreFields) {
HashSet<String> matchFields = new HashSet<>(Arrays.asList(moreFields));
matchFields.add(field);
return new AbstractFieldValueHighlighter((fld, hasMatches) -> matchFields.contains(fld)) {
@Override
public Collection<String> alwaysFetchedFields() {
return matchFields;
}
@Override
public List<String> format(String field, String[] values, String contiguousValue, List<OffsetRange> valueRanges,
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
return Arrays.asList(values);
}
};
}
/**
* Matches all fields and omits their value in the output (so that no highlight or value is emitted).
*/
public static MatchHighlighter.FieldValueHighlighter skipRemaining() {
return new AbstractFieldValueHighlighter((field, hasMatches) -> true) {
@Override
public List<String> format(String field, String[] values, String contiguousValue, List<OffsetRange> valueRanges,
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
return null;
}
};
}
}
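
Taken together, these factory methods are meant to be chained on a MatchHighlighter, where the first applicable highlighter wins. A minimal sketch, assuming an IndexSearcher searcher and Analyzer analyzer are in scope; the "id" and "text1" field names are placeholders mirroring the tests further down:

MatchHighlighter highlighter =
    new MatchHighlighter(searcher, analyzer)
        // Always return the identifier field verbatim.
        .appendFieldHighlighter(FieldValueHighlighters.verbatimValue("id"))
        // Show a highlighted passage if "text1" took part in the match...
        .appendFieldHighlighter(FieldValueHighlighters.highlighted(
            80 * 3, 1, new PassageFormatter("...", ">", "<"), "text1"::equals))
        // ...or fall back to its leading characters if it did not.
        .appendFieldHighlighter(FieldValueHighlighters.maxLeadingCharacters(10, "...", Set.of("text1")))
        // Omit all remaining fields from the output.
        .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());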

View File

@ -0,0 +1,308 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.stream.Stream;
/**
* An example highlighter that combines several lower-level highlighting
* utilities in this package into a fully featured, ready-to-use component.
* <p>
* Note that if you need to customize or tweak the details of highlighting,
* it is better to assemble your own highlighter using those low-level
* building blocks, rather than extend or modify this one.
*/
public class MatchHighlighter {
private final IndexSearcher searcher;
private final OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies;
private final Analyzer analyzer;
private final HashSet<String> fieldsAlwaysReturned = new HashSet<>();
private final List<FieldValueHighlighter> fieldHighlighters = new ArrayList<>();
/**
* Actual per-field highlighter. Field highlighters are probed as to whether they
* are applicable to a particular (field, hasMatches) pair. If a highlighter
* declares it is applicable, its {@link #format} method is invoked and the result
* is returned as the field's value.
*
* @see FieldValueHighlighters
*/
public interface FieldValueHighlighter {
/**
* Check if this highlighter can be applied to a given field.
*
* @param field Field name
* @param hasMatches {@code true} if the field has a non-empty set of match regions.
*/
boolean isApplicable(String field, boolean hasMatches);
/**
* Formats the field's values appropriately.
*/
List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<QueryOffsetRange> matchOffsets);
/**
* @return The set of fields that must be fetched for each document, regardless
* of whether they had matches. This is useful to load and return certain fields
* that should always be included (identifiers, document titles, etc.).
*/
default Collection<String> alwaysFetchedFields() {
return Collections.emptyList();
}
/**
* Returns a new field value highlighter that is a combination of this one and another one.
*/
default FieldValueHighlighter or(FieldValueHighlighter other) {
FieldValueHighlighter first = this;
FieldValueHighlighter second = other;
HashSet<String> fieldUnion = new HashSet<>();
fieldUnion.addAll(first.alwaysFetchedFields());
fieldUnion.addAll(second.alwaysFetchedFields());
return new FieldValueHighlighter() {
@Override
public boolean isApplicable(String field, boolean hasMatches) {
return first.isApplicable(field, hasMatches)
|| second.isApplicable(field, hasMatches);
}
@Override
public List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<QueryOffsetRange> matchOffsets) {
FieldValueHighlighter delegate =
first.isApplicable(field, matchOffsets != null && !matchOffsets.isEmpty()) ? first : second;
return delegate.format(field, values, contiguousValue, valueRanges, matchOffsets);
}
@Override
public Collection<String> alwaysFetchedFields() {
return fieldUnion;
}
};
}
}
/**
* Appends a new highlighter to the field highlighter chain. The order of field highlighters
* is important (first-matching wins).
*/
public MatchHighlighter appendFieldHighlighter(FieldValueHighlighter highlighter) {
fieldHighlighters.add(highlighter);
fieldsAlwaysReturned.addAll(highlighter.alwaysFetchedFields());
return this;
}
/**
* Always fetch the given set of fields for all input documents.
*/
public void alwaysFetchFields(String field, String... otherFields) {
Stream.concat(Stream.of(field), Stream.of(otherFields))
.forEach(fld -> fieldsAlwaysReturned.add(Objects.requireNonNull(fld)));
}
/**
* Single document's highlights.
*/
public static class DocHighlights {
public final int docId;
public final Map<String, List<String>> fields = new LinkedHashMap<>();
public DocHighlights(int docId) {
this.docId = docId;
}
}
/**
* An {@link OffsetRange} of a match, together with the source query that caused it.
*/
public static class QueryOffsetRange extends OffsetRange {
public final Query query;
QueryOffsetRange(Query query, int from, int to) {
super(from, to);
this.query = query;
}
}
private static class DocHit {
final int docId;
private final LeafReader leafReader;
private final int leafDocId;
private final LinkedHashMap<String, List<QueryOffsetRange>> matchRanges
= new LinkedHashMap<>();
DocHit(int docId, LeafReader leafReader, int leafDocId) {
this.docId = docId;
this.leafReader = leafReader;
this.leafDocId = leafDocId;
}
void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
hits.forEach((field, offsets) -> {
List<QueryOffsetRange> target = matchRanges.computeIfAbsent(field, (fld) -> new ArrayList<>());
offsets.forEach(o -> target.add(new QueryOffsetRange(query, o.from, o.to)));
});
}
Document document(Predicate<String> needsField) throws IOException {
// Only load the fields that have a chance to be highlighted.
DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor() {
@Override
public Status needsField(FieldInfo fieldInfo) {
return (matchRanges.containsKey(fieldInfo.name) ||
needsField.test(fieldInfo.name)) ? Status.YES : Status.NO;
}
};
leafReader.document(leafDocId, visitor);
return visitor.getDocument();
}
}
public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) {
this(searcher, analyzer, MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
}
public MatchHighlighter(IndexSearcher searcher,
Analyzer analyzer,
OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) {
this.searcher = searcher;
this.offsetsRetrievalStrategies = offsetsRetrievalStrategies;
this.analyzer = analyzer;
}
public Stream<DocHighlights> highlight(TopDocs topDocs, Query... queries) throws IOException {
// We want to preserve topDocs document ordering and MatchRegionRetriever is optimized
// for streaming, so we'll just prepopulate the map in the proper order.
LinkedHashMap<Integer, DocHit> docHits = new LinkedHashMap<>();
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
docHits.put(scoreDoc.doc, null);
}
// Collect match ranges for each query and associate each range to the origin query.
for (Query q : queries) {
MatchRegionRetriever highlighter =
new MatchRegionRetriever(searcher, searcher.rewrite(q), offsetsRetrievalStrategies);
highlighter.highlightDocuments(topDocs,
(int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits) -> {
DocHit docHit = docHits.get(docId);
if (docHit == null) {
docHit = new DocHit(docId, leafReader, leafDocId);
docHits.put(docId, docHit);
}
docHit.addMatches(q, hits);
});
}
return docHits.values().stream()
.filter(Objects::nonNull) // This should always be the case?
.map(this::computeDocFieldValues);
}
private DocHighlights computeDocFieldValues(DocHit docHit) {
Document doc;
try {
doc = docHit.document(fieldsAlwaysReturned::contains);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
DocHighlights docHighlights = new DocHighlights(docHit.docId);
HashSet<String> unique = new HashSet<>();
for (IndexableField indexableField : doc) {
String field = indexableField.name();
if (!unique.add(field)) {
continue;
}
String[] values = doc.getValues(field);
String contiguousValue = contiguousFieldValue(field, values);
List<OffsetRange> valueRanges = computeValueRanges(field, values);
List<QueryOffsetRange> offsets = docHit.matchRanges.get(field);
List<String> formattedValues = fieldValueHighlighter(field, offsets != null)
.format(field, values, contiguousValue, valueRanges, offsets);
if (formattedValues != null) {
docHighlights.fields.put(field, formattedValues);
}
}
return docHighlights;
}
private List<OffsetRange> computeValueRanges(String field, String[] values) {
ArrayList<OffsetRange> valueRanges = new ArrayList<>();
int offset = 0;
for (CharSequence v : values) {
valueRanges.add(new OffsetRange(offset, offset + v.length()));
offset += v.length();
offset += analyzer.getOffsetGap(field);
}
return valueRanges;
}
private String contiguousFieldValue(String field, String[] values) {
String value;
if (values.length == 1) {
value = values[0];
} else {
// TODO: This can be inefficient if the offset gap is large, but the logic
// of applying offsets would get much more complicated, so leaving it for now
// (we would have to recalculate all offsets to omit gaps).
String fieldGapPadding = " ".repeat(analyzer.getOffsetGap(field));
value = String.join(fieldGapPadding, values);
}
return value;
}
private FieldValueHighlighter fieldValueHighlighter(String field, boolean hasMatches) {
for (FieldValueHighlighter highlighter : fieldHighlighters) {
if (highlighter.isApplicable(field, hasMatches)) {
return highlighter;
}
}
throw new RuntimeException("No field highlighter could be matched to field: " + field);
}
}
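
To make the offset arithmetic above concrete, here is a worked sketch of what contiguousFieldValue and computeValueRanges produce for a multivalued field, assuming the analyzer reports an offset gap of 1 for it:

// Values of a multivalued field and the analyzer's offset gap for that field.
String[] values = {"foo bar", "baz"};
int offsetGap = 1; // analyzer.getOffsetGap(field)
// contiguousFieldValue joins the values with offsetGap padding spaces:
String contiguousValue = String.join(" ".repeat(offsetGap), values); // "foo bar baz"
// computeValueRanges then yields [0, 7) for "foo bar" and [8, 11) for "baz";
// the padding space at offset 7 belongs to no value, so a match offset
// reported against contiguousValue always falls inside exactly one value range.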

View File

@ -80,22 +80,23 @@ public class MatchRegionRetriever {
/**
* A constructor with the default offset strategy supplier.
*
* @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
* in the absence of position offsets in the index. Note that the analyzer must return
* tokens (positions and offsets) identical to the ones stored in the index.
*/
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer) throws IOException {
this(searcher, query, analyzer, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
this(searcher, query, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
}
/**
* @param searcher Index searcher to be used for retrieving matches.
* @param query The query for which matches should be retrieved. The query should be rewritten
* against the provided searcher.
* @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
* in the absence of position offsets in the index. Note that the analyzer must return
* tokens (positions and offsets) identical to the ones stored in the index.
* @param fieldOffsetStrategySupplier A custom supplier of per-field {@link OffsetsRetrievalStrategy}
* instances.
*/
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer,
public MatchRegionRetriever(IndexSearcher searcher, Query query,
OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier)
throws IOException {
leaves = searcher.getIndexReader().leaves();
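
With the Analyzer parameter dropped from this constructor, callers that need custom behavior now build the strategy supplier up front and pass it in explicitly. A minimal sketch reusing the default per-field strategies, assuming searcher, analyzer and query are in scope; the analyzer is still needed to compute the defaults, it just no longer flows through the retriever constructor:

OffsetsRetrievalStrategySupplier strategies =
    MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer);
MatchRegionRetriever retriever =
    new MatchRegionRetriever(searcher, searcher.rewrite(query), strategies);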

View File

@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
/**
* An analyzer for tests that has a predefined offset and position gap.
*/
class AnalyzerWithGaps extends DelegatingAnalyzerWrapper {
private final Analyzer delegate;
private final int offsetGap;
private final int positionGap;
AnalyzerWithGaps(int offsetGap, int positionGap, Analyzer delegate) {
super(delegate.getReuseStrategy());
this.delegate = delegate;
this.offsetGap = offsetGap;
this.positionGap = positionGap;
}
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return delegate;
}
@Override
public int getOffsetGap(String fieldName) {
return offsetGap;
}
@Override
public int getPositionIncrementGap(String fieldName) {
return positionGap;
}
}
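
For reference, a minimal sketch of how the tests below use this wrapper around a plain whitespace analyzer:

// Index multivalued fields with an offset gap of 1 and a position gap of 100
// between consecutive values; tokenization itself is delegated unchanged.
Analyzer gapped = new AnalyzerWithGaps(1, 100, new WhitespaceAnalyzer());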

View File

@ -0,0 +1,105 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.function.BiFunction;
import java.util.function.Consumer;
/**
* Utility class for building an ephemeral document index
* and running a block of code on its reader.
*/
class IndexBuilder {
public static final String FLD_ID = "id";
public static final String FLD_SORT_ORDER = "id_order";
private final BiFunction<String, String, IndexableField> toField;
private final ArrayList<Document> documents = new ArrayList<>();
private int seq;
class DocFields {
final Document document;
public DocFields(Document doc) {
this.document = doc;
}
public void add(String field, String... values) {
assert values.length > 0 : "At least one field value is required.";
for (String value : values) {
document.add(toField.apply(field, value));
}
}
}
IndexBuilder(BiFunction<String, String, IndexableField> valueToField) {
this.toField = valueToField;
}
public IndexBuilder doc(String field, String... values) {
return doc(fields -> {
fields.add(field, values);
});
}
public IndexBuilder doc(Consumer<DocFields> fields) {
Document doc = new Document();
doc.add(new NumericDocValuesField(FLD_SORT_ORDER, seq));
doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES));
fields.accept(new DocFields(doc));
documents.add(doc);
return this;
}
public IndexBuilder build(Analyzer analyzer, IOUtils.IOConsumer<DirectoryReader> block) throws IOException {
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setIndexSort(new Sort(new SortField(FLD_SORT_ORDER, SortField.Type.LONG)));
try (Directory directory = new ByteBuffersDirectory();
IndexWriter iw = new IndexWriter(directory, config)) {
for (Document doc : documents) {
iw.addDocument(doc);
}
if (RandomizedTest.randomBoolean()) {
iw.commit();
}
iw.flush();
try (DirectoryReader reader = DirectoryReader.open(iw)) {
block.accept(reader);
}
}
return this;
}
}
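
A minimal usage sketch, as the tests below do it; toField maps a field name and a value to a concrete IndexableField, and the FLD_* constants are those tests' field names:

new IndexBuilder(this::toField)
    .doc(FLD_TEXT1, "foo bar baz")   // one field, a single value
    .doc(fields -> {                 // several (possibly multivalued) fields per document
      fields.add(FLD_TEXT1, "multi", "valued");
      fields.add(FLD_TEXT2, "other field");
    })
    .build(analyzer, reader -> {
      // assertions against the ephemeral DirectoryReader go here
    });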

View File

@ -0,0 +1,466 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class TestMatchHighlighter extends LuceneTestCase {
private static final String FLD_ID = "id";
private static final String FLD_TEXT1 = "text1";
private static final String FLD_TEXT2 = "text2";
private FieldType TYPE_TEXT_POSITIONS_OFFSETS;
private FieldType TYPE_TEXT_POSITIONS;
private PerFieldAnalyzerWrapper analyzer;
@Before
public void setup() throws IOException {
TYPE_TEXT_POSITIONS = TextField.TYPE_STORED;
TYPE_TEXT_POSITIONS_OFFSETS = new FieldType(TextField.TYPE_STORED);
TYPE_TEXT_POSITIONS_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
TYPE_TEXT_POSITIONS_OFFSETS.freeze();
Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
// Create an analyzer with some synonyms, just to showcase them.
SynonymMap synonymMap = buildSynonymMap(new String[][]{
{"moon\u0000shine", "firewater"},
{"firewater", "moon\u0000shine"},
});
// Use a non-empty offset gap so that the break iterator doesn't go haywire on multiple
// values glued together.
final int offsetGap = RandomizedTest.randomIntBetween(1, 2);
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
Analyzer synonymsAnalyzer =
new AnalyzerWithGaps(offsetGap, positionGap, new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WhitespaceTokenizer();
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
return new TokenStreamComponents(tokenizer, tokenStream);
}
});
fieldAnalyzers.put(FLD_TEXT1, synonymsAnalyzer);
fieldAnalyzers.put(FLD_TEXT2, synonymsAnalyzer);
analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
}
static SynonymMap buildSynonymMap(String[][] synonyms) throws IOException {
SynonymMap.Builder builder = new SynonymMap.Builder();
for (String[] pair : synonyms) {
assertThat(pair.length, Matchers.equalTo(2));
builder.add(new CharsRef(pair[0]), new CharsRef(pair[1]), true);
}
return builder.build();
}
@Test
public void testBasicUsage() throws IOException {
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "foo bar baz")
.doc(FLD_TEXT1, "bar foo baz")
.doc(fields -> {
fields.add(FLD_TEXT1, "Very long content but not matching anything.");
fields.add(FLD_TEXT2, "no foo but bar");
})
.build(analyzer, reader -> {
Query query = new BooleanQuery.Builder()
.add(new TermQuery(new Term(FLD_TEXT1, "foo")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(FLD_TEXT2, "bar")), BooleanClause.Occur.SHOULD)
.build();
// In the most basic scenario, we run a search against a query, retrieve
// top docs...
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// ...and would want a fixed set of fields from those documents, some of them
// possibly highlighted if they matched the query.
//
// This configures the highlighter so that the FLD_ID field is always returned verbatim,
// and FLD_TEXT1 is returned *only if it contained a query match*.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
// Note that document field highlights are a stream over documents in topDocs. In the remaining code
// we will just collect them on the fly into preformatted strings.
Stream<MatchHighlighter.DocHighlights> highlights = highlighter.highlight(topDocs, query);
assertHighlights(toDocList(highlights),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2");
// In a more realistic use case, you'd want to show the value of a given field *regardless* of whether it
// contained a highlight or not -- it is odd that document "id: 2" above doesn't have the 'text1' field
// shown because that field wasn't part of the query match.
//
// Let's say the field is also potentially long; if it contains a match,
// we would want to display the contextual snippet surrounding that match. If it does not contain any
// matches, we would want to display its content up to a given number of characters (lead lines).
//
// Let's do this by adding an appropriate field highlighter on FLD_TEXT1.
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.maxLeadingCharacters(10, "...", Set.of(FLD_TEXT1)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2",
" text1: Very long...");
// Field highlighters can apply to multiple fields and be chained for convenience.
// For example, this defines a combined highlighter over both FLD_TEXT1 and FLD_TEXT2.
Set<String> fields = Set.of(FLD_TEXT1, FLD_TEXT2);
MatchHighlighter.FieldValueHighlighter highlightedOrAbbreviated =
FieldValueHighlighters.highlighted(80 * 3, 1, new PassageFormatter("...", ">", "<"), fields::contains)
.or(FieldValueHighlighters.maxLeadingCharacters(10, "...", fields));
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(highlightedOrAbbreviated)
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2",
" text1: Very long...",
" text2: no foo but >bar<");
});
}
@Test
public void testSynonymHighlight() throws IOException {
// There is nothing special needed to highlight or process complex queries, synonyms, etc.
// The synonyms are defined in this class's setup method.
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "Where the moon shine falls, firewater flows.")
.build(analyzer, reader -> {
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
Query query = new TermQuery(new Term(FLD_TEXT1, "firewater"));
assertHighlights(toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
"0. text1: Where the >moon shine< falls, >firewater< flows.");
query = new PhraseQuery(FLD_TEXT1, "moon", "shine");
assertHighlights(toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
"0. text1: Where the >moon shine< falls, >firewater< flows.");
});
}
@Test
public void testCustomFieldHighlightHandling() throws IOException {
// The match highlighter is a showcase of the individual components in this package, suitable
// for creating any kind of field-display design.
//
// In this example we will build a custom field highlighting handler that
// highlights matches over a multivalued field, shows the field's values even if it received
// no matches, and limits the number of values displayed to at most 2 (with an appropriate message).
new IndexBuilder(this::toField)
// Just one document, one field, four values.
.doc(FLD_TEXT1, "foo bar", "bar foo baz", "bar baz foo", "baz baz baz")
.build(analyzer, reader -> {
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER;
// Let's start with the simple predefined highlighter so that the field's value is shown
// and highlighted when it is part of the hit.
MatchHighlighter.FieldValueHighlighter highlighted = FieldValueHighlighters.highlighted(
80 * 3, 2, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals);
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(highlighted)
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
Query query = new TermQuery(new Term(FLD_TEXT1, "foo"));
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// Note the highlighter is configured with at most 2 passages, so the match on the
// third value ("bar baz foo") is omitted. No ellipsis is inserted either, because
// the values are displayed in full.
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
"0. text1: >foo< bar, bar >foo< baz");
// So the above works fine if the field received a match but omits the field otherwise. We can
// force the display of this field by chaining it with the verbatim value highlighter:
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(highlighted.or(FieldValueHighlighters.verbatimValue(FLD_TEXT1)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
"0. text1: foo bar, bar foo baz, bar baz foo, baz baz baz");
// But this is not exactly what we'd like because we want to limit the display of values to the first two.
// Let's just write a custom field highlighter handler that does it.
class AtMostNValuesHighlighter implements MatchHighlighter.FieldValueHighlighter {
private final String field;
private final int limit;
AtMostNValuesHighlighter(String field, int limit) {
this.field = field;
this.limit = limit;
}
@Override
public boolean isApplicable(String field, boolean hasMatches) {
return Objects.equals(field, this.field);
}
@Override
public List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
if (values.length <= limit) {
return Arrays.asList(values);
} else {
List<String> collected = Stream.of(values).limit(limit).collect(Collectors.toList());
int remaining = values.length - collected.size();
collected.add(String.format(Locale.ROOT, "[%d omitted]", remaining));
return collected;
}
}
@Override
public Collection<String> alwaysFetchedFields() {
return Collections.singleton(field);
}
}
// We can now chain it as usual and contemplate the result.
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(highlighted.or(new AtMostNValuesHighlighter(FLD_TEXT1, 2)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
"0. text1: >foo< bar, bar >foo< baz");
assertHighlights(toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
"0. text1: foo bar, bar foo baz, [2 omitted]");
});
}
@Test
public void testHighlightMoreQueriesAtOnceShowoff() throws IOException {
// The match highlighter's underlying components are powerful enough to build interesting,
// if not always super-practical, things. In this case, we would like to highlight
// a set of matches of *more than one* query over the same set of input documents. This includes
// highest-scoring passage resolution (from multiple hits) and different highlight markers
// for each query.
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "foo bar baz")
.doc(FLD_TEXT1, "foo baz bar")
.build(analyzer, reader -> {
// Let's start with the two queries. The first one will be an unordered
// query for (foo, baz) with a max gap of 1; let's use intervals for this.
Query q1 = new IntervalQuery(FLD_TEXT1,
Intervals.maxgaps(1,
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("baz"))));
// The second one will be a simpler term query for "bar".
Query q2 = new TermQuery(new Term(FLD_TEXT1, "bar"));
// Let's fetch matching documents by combining the two into a Boolean query.
Query query = new BooleanQuery.Builder()
.add(q1, BooleanClause.Occur.SHOULD)
.add(q2, BooleanClause.Occur.SHOULD)
.build();
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// If we use the "regular" highlighter, the result will be slightly odd: a nested
// highlight over "bar" within the first match. Also, you can't distinguish which of the sub-queries
// caused which highlight marker... but if it were HTML then you could give the span
// some semi-translucent background and layered matches would be visible.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", "<span>", "</span>"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
"0. text1: <span>foo <span>bar</span> baz</span>",
"1. text1: <span>foo baz</span> <span>bar</span>");
// To separate highlights for multiple queries we'll pass them to the highlighter
// individually and differentiate the highlight markers as they are applied. Let's start with the
// customized field highlighter. It utilizes the fact that match ranges passed from MatchHighlighter
// contain a reference to the original query that produced the match.
class SeparateMarkerFieldHighlighter implements MatchHighlighter.FieldValueHighlighter {
private final String field;
private final Map<Query, String> queryClassMap;
SeparateMarkerFieldHighlighter(String field, Map<Query, String> queryClassMap) {
this.field = field;
this.queryClassMap = queryClassMap;
}
@Override
public boolean isApplicable(String field, boolean hasMatches) {
return Objects.equals(field, this.field) && hasMatches;
}
@Override
public List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
PassageSelector passageSelector = new PassageSelector();
int maxPassageWindow = 80;
int maxPassages = 3;
List<Passage> bestPassages =
passageSelector.pickBest(contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges);
// We know the offset ranges passed to us by MatchHighlighter are instances of QueryOffsetRange,
// so we compute the CSS class from the originating query.
Function<OffsetRange, String> queryToClass =
(range) -> queryClassMap.get(((MatchHighlighter.QueryOffsetRange) range).query);
PassageFormatter passageFormatter = new PassageFormatter("...",
(range) -> "<span class='" + queryToClass.apply(range) + "'>",
(range) -> "</span>");
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
}
}
// And this is pretty much it. We now set up query classes to display, set up the highlighter...
Map<Query, String> queryClassMap = Map.of(q1, "q1", q2, "q2");
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(new SeparateMarkerFieldHighlighter(FLD_TEXT1, queryClassMap))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
// ...and run highlighting. Note the queries passed to the highlighter are the individual
// sub-clauses of the Boolean query used to fetch documents.
assertHighlights(toDocList(highlighter.highlight(topDocs, q1, q2)),
"0. text1: <span class='q1'>foo <span class='q2'>bar</span> baz</span>",
"1. text1: <span class='q1'>foo baz</span> <span class='q2'>bar</span>");
});
}
private void assertHighlights(List<List<String>> docList, String... expectedFormattedLines) {
ArrayList<String> actualLines = new ArrayList<>();
for (int doc = 0; doc < docList.size(); doc++) {
List<String> fields = docList.get(doc);
for (int i = 0; i < fields.size(); i++) {
actualLines.add((i == 0 ? String.format(Locale.ROOT, "%2d. ", doc) : " ") + fields.get(i));
}
}
if (!Arrays.equals(
Stream.of(expectedFormattedLines).map(String::trim).toArray(),
actualLines.stream().map(String::trim).toArray())) {
throw new AssertionError("Actual hits were:\n" +
String.join("\n", actualLines) + "\n\n but expected them to be:\n" +
String.join("\n", expectedFormattedLines));
}
}
private List<List<String>> toDocList(Stream<MatchHighlighter.DocHighlights> highlights) {
return highlights.map(docHighlights ->
docHighlights.fields.entrySet().stream()
.map(e -> e.getKey() + ": " + String.join(", ", e.getValue()))
.collect(Collectors.toList())
).collect(Collectors.toList());
}
private IndexableField toField(String name, String value) {
switch (name) {
case FLD_TEXT1:
return new Field(name, value, TYPE_TEXT_POSITIONS_OFFSETS);
case FLD_TEXT2:
return new Field(name, value, TYPE_TEXT_POSITIONS);
default:
throw new AssertionError("Don't know how to handle this field: " + name);
}
}
}

View File

@ -20,21 +20,17 @@ import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.IntervalQuery;
@ -52,19 +48,13 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
@ -75,11 +65,9 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.emptyArray;
import static org.hamcrest.Matchers.not;
public class TestMatchRegionRetriever extends LuceneTestCase {
private static final String FLD_ID = "field_id";
private static final String FLD_ID = IndexBuilder.FLD_ID;
private static final String FLD_TEXT_POS_OFFS1 = "field_text_offs1";
private static final String FLD_TEXT_POS_OFFS2 = "field_text_offs2";
@ -100,7 +88,7 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
private Analyzer analyzer;
@Before
public void setup() {
public void setup() throws IOException {
TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED);
TYPE_STORED_WITH_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
TYPE_STORED_WITH_OFFSETS.freeze();
@ -109,26 +97,24 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
TYPE_STORED_NO_POSITIONS.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
TYPE_STORED_NO_POSITIONS.freeze();
final int offsetGap = RandomizedTest.randomIntBetween(0, 2);
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
Analyzer whitespaceAnalyzer =
new Analyzer() {
final int offsetGap = RandomizedTest.randomIntBetween(0, 2);
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
new AnalyzerWithGaps(offsetGap, positionGap,
new WhitespaceAnalyzer(WhitespaceTokenizer.DEFAULT_MAX_WORD_LEN));
SynonymMap synonymMap = TestMatchHighlighter.buildSynonymMap(new String[][] {
{"foo\u0000bar", "syn1"},
{"baz", "syn2\u0000syn3"},
});
Analyzer synonymsAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
WhitespaceTokenizer tokenizer =
new WhitespaceTokenizer(CharTokenizer.DEFAULT_MAX_WORD_LEN);
return new TokenStreamComponents(tokenizer);
}
@Override
public int getOffsetGap(String fieldName) {
return offsetGap;
}
@Override
public int getPositionIncrementGap(String fieldName) {
return positionGap;
Tokenizer tokenizer = new WhitespaceTokenizer();
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
return new TokenStreamComponents(tokenizer, tokenStream);
}
};
@ -138,26 +124,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
fieldAnalyzers.put(FLD_TEXT_POS_OFFS1, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_POS_OFFS2, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_NOPOS, whitespaceAnalyzer);
try {
SynonymMap.Builder b = new SynonymMap.Builder();
b.add(new CharsRef("foo\u0000bar"), new CharsRef("syn1"), true);
b.add(new CharsRef("baz"), new CharsRef("syn2\u0000syn3"), true);
SynonymMap synonymMap = b.build();
Analyzer synonymsAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WhitespaceTokenizer();
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
return new TokenStreamComponents(tokenizer, tokenStream);
}
};
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer);
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer);
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer);
analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
}
@ -184,13 +152,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkTermQuery(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz")
.doc(field, "bar foo baz")
.doc(field, "bar baz foo")
.doc(field, "bar bar bar irrelevant")
.build(analyzer, reader -> {
assertThat(highlights(reader, new TermQuery(new Term(field, "foo"))),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar baz')", field),
@ -217,17 +184,17 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
.add(new TermQuery(new Term(field, "xyz")), BooleanClause.Occur.MUST_NOT)
.build();
withReader(
List.of(
Map.of(field, values("foo bar baz abc")),
Map.of(field, values("bar foo baz def")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz abc")
.doc(field, "bar foo baz def")
.doc(field, "bar baz foo xyz")
.build(analyzer, reader -> {
assertThat(highlights(reader, query),
containsInAnyOrder(
fmt("0: (%s: '>foo bar baz< abc')", field),
fmt("1: (%s: 'bar >foo baz< def')", field)));
});
}
);
}
@Test
@ -241,12 +208,11 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkVariousQueryTypes(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz abc")),
Map.of(field, values("bar foo baz def")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz abc")
.doc(field, "bar foo baz def")
.doc(field, "bar baz foo xyz")
.build(analyzer, reader -> {
assertThat(highlights(reader, stdQueryParser.apply("foo baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar >baz< abc')", field),
@ -297,31 +263,31 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
assertThat(highlights(reader, new MatchAllDocsQuery()),
Matchers.hasSize(0));
});
}
);
withReader(
List.of(
Map.of(field, values("foo baz foo")),
Map.of(field, values("bas baz foo")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo baz foo")
.doc(field, "bas baz foo")
.doc(field, "bar baz foo xyz")
.build(analyzer, reader -> {
assertThat(
highlights(reader, stdQueryParser.apply("[bar TO baz] -bar", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >baz< foo')", field), fmt("1: (%s: '>bas< >baz< foo')", field)));
});
}
);
}
@Test
public void testIntervalQueries() throws IOException {
String field = FLD_TEXT_POS_OFFS;
withReader(
List.of(
Map.of(field, values("foo baz foo")),
Map.of(field, values("bas baz foo")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo baz foo")
.doc(field, "bas baz foo")
.doc(field, "bar baz foo xyz")
.build(analyzer, reader -> {
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.unordered(
@ -374,7 +340,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
});
}
);
}
@Test
@ -388,36 +355,37 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
public void checkMultivaluedFields(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar", "baz abc", "bad baz")),
Map.of(field, values("bar foo", "baz def")),
Map.of(field, values("bar baz", "foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar", "baz abc", "bad baz")
.doc(field, "bar foo", "baz def")
.doc(field, "bar baz", "foo xyz")
.build(analyzer, reader -> {
assertThat(highlights(reader, stdQueryParser.apply("baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>baz< abc | bad >baz<')", field),
fmt("1: (%s: '>baz< def')", field),
fmt("2: (%s: 'bar >baz<')", field)));
});
}
);
}
@Test
public void testMultiFieldHighlights() throws IOException {
for (String[] fields :
for (String[] fieldPairs :
new String[][]{
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS}
}) {
String field1 = fields[0];
String field2 = fields[1];
withReader(
List.of(
Map.of(
field1, values("foo bar", "baz abc"),
field2, values("foo baz", "loo bar"))),
reader -> {
String field1 = fieldPairs[0];
String field2 = fieldPairs[1];
new IndexBuilder(this::toField)
.doc(fields -> {
fields.add(field1, "foo bar", "baz abc");
fields.add(field2, "foo baz", "loo bar");
})
.build(analyzer, reader -> {
String ordered =
Stream.of(fmt("(%s: '>baz< abc')", field1), fmt("(%s: 'loo >bar<')", field2))
.sorted()
@ -428,7 +396,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
reader,
stdQueryParser.apply(field1 + ":baz" + " OR " + field2 + ":bar", field1)),
containsInAnyOrder(fmt("0: %s", ordered)));
});
}
);
}
}
@ -440,15 +409,17 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testNoRewrite() throws IOException {
String field1 = FLD_TEXT_POS_OFFS1;
String field2 = FLD_TEXT_POS_OFFS2;
withReader(
List.of(
Map.of(
field1, values("0100"),
field2, values("loo bar")),
Map.of(
field1, values("0200"),
field2, values("foo bar"))),
reader -> {
new IndexBuilder(this::toField)
.doc(fields -> {
fields.add(field1, "0100");
fields.add(field2, "loo bar");
})
.doc(fields -> {
fields.add(field1, "0200");
fields.add(field2, "foo bar");
})
.build(analyzer, reader -> {
String expected = fmt("0: (%s: '>0100<')(%s: 'loo >bar<')", field1, field2);
assertThat(
highlights(
@ -461,7 +432,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
reader,
stdQueryParser.apply(fmt("+%s:01* AND %s:bar", field1, field2), field1)),
containsInAnyOrder(expected));
});
}
);
}
@Test
@ -475,9 +447,9 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
public void checkNestedQueryHits(String field) throws IOException {
withReader(
List.of(Map.of(field, values("foo bar baz abc"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz abc")
.build(analyzer, reader -> {
assertThat(
highlights(
reader,
@ -496,7 +468,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
.add(new TermQuery(new Term(field, "baz")), BooleanClause.Occur.SHOULD)
.build()),
containsInAnyOrder(fmt("0: (%s: '>foo >bar< >baz<< abc')", field)));
});
}
);
}
@Test
@ -510,13 +483,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkGraphQuery(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz")
.doc(field, "bar foo baz")
.doc(field, "bar baz foo")
.doc(field, "bar bar bar irrelevant")
.build(analyzer, reader -> {
assertThat(highlights(reader, new TermQuery(new Term(field, "syn1"))),
containsInAnyOrder(fmt("0: (%s: '>foo bar< baz')", field)));
@ -536,7 +508,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
assertThat(
highlights(reader, stdQueryParser.apply(field + ":\"foo syn2 syn3\"", field)),
containsInAnyOrder(fmt("1: (%s: 'bar >foo baz<')", field)));
});
}
);
}
@Test
@ -550,13 +523,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkSpanQueries(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz")
.doc(field, "bar foo baz")
.doc(field, "bar baz foo")
.doc(field, "bar bar bar irrelevant")
.build(analyzer, reader -> {
assertThat(
highlights(
reader,
@ -598,7 +570,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
fmt("0: (%s: '>foo bar< baz')", field),
fmt("1: (%s: '>bar foo< baz')", field),
fmt("2: (%s: '>bar baz foo<')", field)));
});
}
);
}
/**
@ -610,12 +583,10 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testTextFieldNoPositionsOffsetFromValues() throws Exception {
String field = FLD_TEXT_NOPOS;
withReader(
List.of(
Map.of(FLD_TEXT_NOPOS, values("foo bar")),
Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz baz"))
),
reader -> {
new IndexBuilder(this::toField)
.doc(FLD_TEXT_NOPOS, "foo bar")
.doc(FLD_TEXT_NOPOS, "foo bar", "baz baz")
.build(analyzer, reader -> {
OffsetsRetrievalStrategySupplier defaults = MatchRegionRetriever
.computeOffsetRetrievalStrategies(reader, analyzer);
OffsetsRetrievalStrategySupplier customSuppliers = (fld) -> {
@ -634,7 +605,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("0: (%s: '>foo bar<')", field),
fmt("1: (%s: '>foo bar< | >baz baz<')", field)));
});
}
);
}
/**
@ -648,13 +620,13 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testTextFieldNoPositionsOffsetsFromTokens() throws Exception {
String field = FLD_TEXT_NOPOS;
withReader(
List.of(
Map.of(FLD_TEXT_NOPOS, values("foo bar"),
FLD_TEXT_POS, values("bar bar")),
Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz bar"))
),
reader -> {
new IndexBuilder(this::toField)
.doc(fields -> {
fields.add(FLD_TEXT_NOPOS, "foo bar");
fields.add(FLD_TEXT_POS, "bar bar");
})
.doc(FLD_TEXT_NOPOS, "foo bar", "baz bar")
.build(analyzer, reader -> {
assertThat(
highlights(
reader,
@ -662,7 +634,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("0: (%s: 'foo >bar<')", field),
fmt("1: (%s: 'foo >bar< | baz >bar<')", field)));
});
}
);
}
private List<String> highlights(IndexReader reader, Query query) throws IOException {
@ -702,46 +675,14 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
};
MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, analyzer,
offsetsStrategySupplier);
MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, offsetsStrategySupplier);
highlighter.highlightDocuments(topDocs, highlightCollector);
return highlights;
}
private String[] values(String... values) {
assertThat(values, not(emptyArray()));
return values;
}
private void withReader(
Collection<Map<String, String[]>> docs, IOUtils.IOConsumer<DirectoryReader> block)
throws IOException {
IndexWriterConfig config = new IndexWriterConfig(analyzer);
try (Directory directory = new ByteBuffersDirectory()) {
IndexWriter iw = new IndexWriter(directory, config);
int seq = 0;
for (Map<String, String[]> fields : docs) {
Document doc = new Document();
doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES));
for (Map.Entry<String, String[]> field : fields.entrySet()) {
for (String value : field.getValue()) {
doc.add(toField(field.getKey(), value));
}
}
iw.addDocument(doc);
if (RandomizedTest.randomBoolean()) {
iw.commit();
}
}
iw.flush();
try (DirectoryReader reader = DirectoryReader.open(iw)) {
block.accept(reader);
}
}
private static String fmt(String string, Object... args) {
return String.format(Locale.ROOT, string, args);
}
private IndexableField toField(String name, String value) {
@ -760,8 +701,4 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
throw new AssertionError("Don't know how to handle this field: " + name);
}
}
private static String fmt(String string, Object... args) {
return String.format(Locale.ROOT, string, args);
}
}