mirror of https://github.com/apache/lucene.git
LUCENE-9464: Add high(er)-level hit highlighter example that demonstrates and uses low-level components (#1820)
This commit is contained in: parent 8debc9d0c2, commit e2f3f626ee

@@ -0,0 +1,139 @@
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.matchhighlight;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiPredicate;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
/**
|
||||
* A factory of {@link org.apache.lucene.search.matchhighlight.MatchHighlighter.FieldValueHighlighter} classes
|
||||
* that cover typical use cases (verbatim values, highlights, abbreviations).
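* <p>
* A minimal sketch of how these factory methods are typically combined (the field names below
* are hypothetical; {@code searcher} and {@code analyzer} are assumed to exist in the caller):
* <pre>{@code
* Set<String> fields = Set.of("title", "body");
* MatchHighlighter.FieldValueHighlighter highlightedOrAbbreviated =
*     FieldValueHighlighters.highlighted(
*             80 * 3, 1, new PassageFormatter("...", ">", "<"), fields::contains)
*         .or(FieldValueHighlighters.maxLeadingCharacters(10, "...", fields));
*
* MatchHighlighter highlighter =
*     new MatchHighlighter(searcher, analyzer)
*         .appendFieldHighlighter(FieldValueHighlighters.verbatimValue("id"))
*         .appendFieldHighlighter(highlightedOrAbbreviated)
*         .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
* }</pre>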
|
||||
*
|
||||
* @see MatchHighlighter#appendFieldHighlighter
|
||||
*/
|
||||
public final class FieldValueHighlighters {
|
||||
private FieldValueHighlighters() {
|
||||
}
|
||||
|
||||
private static abstract class AbstractFieldValueHighlighter implements MatchHighlighter.FieldValueHighlighter {
|
||||
private final BiPredicate<String, Boolean> testPredicate;
|
||||
|
||||
protected AbstractFieldValueHighlighter(BiPredicate<String, Boolean> testPredicate) {
|
||||
this.testPredicate = testPredicate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean isApplicable(String field, boolean hasMatches) {
|
||||
return testPredicate.test(field, hasMatches);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Displays up to {@code maxLeadingCharacters} characters of the field's value, regardless of
* whether it contained highlights or not.
|
||||
*/
|
||||
public static MatchHighlighter.FieldValueHighlighter maxLeadingCharacters(int maxLeadingCharacters, String ellipsis, Set<String> fields) {
|
||||
PassageSelector passageSelector = defaultPassageSelector();
|
||||
PassageFormatter passageFormatter = new PassageFormatter(ellipsis, "", "");
|
||||
return new AbstractFieldValueHighlighter((field, hasMatches) -> fields.contains(field)) {
|
||||
@Override
|
||||
public List<String> format(String field, String[] values, String contiguousValue,
|
||||
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
|
||||
List<Passage> bestPassages =
|
||||
passageSelector.pickBest(contiguousValue, Collections.emptyList(), maxLeadingCharacters, 1, valueRanges);
|
||||
|
||||
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> alwaysFetchedFields() {
|
||||
return fields;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Default preconfigured {@link PassageSelector}.
|
||||
*/
|
||||
public static PassageSelector defaultPassageSelector() {
|
||||
return new PassageSelector(
|
||||
PassageSelector.DEFAULT_SCORER,
|
||||
new BreakIteratorShrinkingAdjuster());
|
||||
}
|
||||
|
||||
/**
|
||||
* Highlights fields matching predicate {@code matchFields} only if they contained query matches.
|
||||
*/
|
||||
public static MatchHighlighter.FieldValueHighlighter highlighted(
|
||||
int maxPassageWindow,
|
||||
int maxPassages,
|
||||
PassageFormatter passageFormatter,
|
||||
Predicate<String> matchFields) {
|
||||
PassageSelector passageSelector = defaultPassageSelector();
|
||||
return new AbstractFieldValueHighlighter((field, hasMatches) -> matchFields.test(field) && hasMatches) {
|
||||
@Override
|
||||
public List<String> format(String field, String[] values, String contiguousValue,
|
||||
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
|
||||
assert matchOffsets != null;
|
||||
|
||||
List<Passage> bestPassages =
|
||||
passageSelector.pickBest(contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges);
|
||||
|
||||
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Always returns raw field values; no highlighting or value truncation is applied.
|
||||
*/
|
||||
public static MatchHighlighter.FieldValueHighlighter verbatimValue(String field, String... moreFields) {
|
||||
HashSet<String> matchFields = new HashSet<>(Arrays.asList(moreFields));
|
||||
matchFields.add(field);
|
||||
return new AbstractFieldValueHighlighter((fld, hasMatches) -> matchFields.contains(fld)) {
|
||||
@Override
|
||||
public Collection<String> alwaysFetchedFields() {
|
||||
return matchFields;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> format(String field, String[] values, String contiguousValue, List<OffsetRange> valueRanges,
|
||||
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
|
||||
return Arrays.asList(values);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Matches all fields and omits their value in the output (so that no highlight or value is emitted).
|
||||
*/
|
||||
public static MatchHighlighter.FieldValueHighlighter skipRemaining() {
|
||||
return new AbstractFieldValueHighlighter((field, hasMatches) -> true) {
|
||||
@Override
|
||||
public List<String> format(String field, String[] values, String contiguousValue, List<OffsetRange> valueRanges,
|
||||
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,308 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.matchhighlight;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.DocumentStoredFieldVisitor;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
|
||||
* An example highlighter that combines several lower-level highlighting
|
||||
* utilities in this package into a fully featured, ready-to-use component.
|
||||
* <p>
|
||||
* Note that if you need to customize or tweak the details of highlighting,
* it is better to assemble your own highlighter from those low-level
* building blocks rather than to extend or modify this one.
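* <p>
* A minimal end-to-end sketch (the {@code searcher}, {@code analyzer}, {@code query} and field
* names are assumed to be set up by the caller; see {@code TestMatchHighlighter} for complete examples):
* <pre>{@code
* MatchHighlighter highlighter =
*     new MatchHighlighter(searcher, analyzer)
*         .appendFieldHighlighter(FieldValueHighlighters.verbatimValue("id"))
*         .appendFieldHighlighter(FieldValueHighlighters.highlighted(
*             80 * 3, 1, new PassageFormatter("...", ">", "<"), "text"::equals))
*         .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
*
* TopDocs topDocs = searcher.search(query, 10);
* highlighter.highlight(topDocs, query).forEach(docHighlights -> {
*   // docHighlights.fields maps each field name to its formatted (possibly highlighted) values.
* });
* }</pre>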
|
||||
*/
|
||||
public class MatchHighlighter {
|
||||
private final IndexSearcher searcher;
|
||||
private final OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies;
|
||||
private final Analyzer analyzer;
|
||||
|
||||
private final HashSet<String> fieldsAlwaysReturned = new HashSet<>();
|
||||
private final List<FieldValueHighlighter> fieldHighlighters = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* Actual per-field highlighter. Field highlighters are probed to check whether they
* are applicable to a particular (field, hasMatches) pair. If a highlighter
* declares it is applicable, its {@link #format} method is invoked and the result
* is returned as the field's value.
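* <p>
* A bare-bones custom implementation might look like the sketch below (the {@code "title"} field
* name is only an illustration; a real implementation would typically turn {@code matchOffsets}
* into formatted passages, as the predefined highlighters in {@link FieldValueHighlighters} do):
* <pre>{@code
* class TitleHighlighter implements MatchHighlighter.FieldValueHighlighter {
*   public boolean isApplicable(String field, boolean hasMatches) {
*     return "title".equals(field) && hasMatches;
*   }
*
*   public List<String> format(String field, String[] values, String contiguousValue,
*       List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
*     return Arrays.asList(values); // Echo raw values in this sketch.
*   }
* }
* }</pre>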
|
||||
*
|
||||
* @see FieldValueHighlighters
|
||||
*/
|
||||
public interface FieldValueHighlighter {
|
||||
/**
|
||||
* Check if this highlighter can be applied to a given field.
|
||||
*
|
||||
* @param field Field name
|
||||
* @param hasMatches {@code true} if the field has a non-empty set of match regions.
|
||||
*/
|
||||
boolean isApplicable(String field, boolean hasMatches);
|
||||
|
||||
/**
|
||||
* Formats the field's values appropriately for presentation (highlighting, truncation, etc.).
|
||||
*/
|
||||
List<String> format(String field, String[] values, String contiguousValue,
|
||||
List<OffsetRange> valueRanges, List<QueryOffsetRange> matchOffsets);
|
||||
|
||||
/**
|
||||
* @return A set of fields that must be fetched for each document, regardless
* of whether they had matches or not. This is useful for loading and returning fields
* that should always be included (identifiers, document titles, etc.).
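* <p>
* For example, an implementation that always loads a (hypothetical) identifier field could return:
* <pre>{@code
* public Collection<String> alwaysFetchedFields() {
*   return Collections.singleton("id");
* }
* }</pre>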
|
||||
*/
|
||||
default Collection<String> alwaysFetchedFields() {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new field value highlighter that is a combination of this one and another one.
|
||||
*/
|
||||
default FieldValueHighlighter or(FieldValueHighlighter other) {
|
||||
FieldValueHighlighter first = this;
|
||||
FieldValueHighlighter second = other;
|
||||
|
||||
HashSet<String> fieldUnion = new HashSet<>();
|
||||
fieldUnion.addAll(first.alwaysFetchedFields());
|
||||
fieldUnion.addAll(second.alwaysFetchedFields());
|
||||
|
||||
return new FieldValueHighlighter() {
|
||||
@Override
|
||||
public boolean isApplicable(String field, boolean hasMatches) {
|
||||
return first.isApplicable(field, hasMatches)
|
||||
|| second.isApplicable(field, hasMatches);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> format(String field, String[] values, String contiguousValue,
|
||||
List<OffsetRange> valueRanges, List<QueryOffsetRange> matchOffsets) {
|
||||
FieldValueHighlighter delegate =
|
||||
first.isApplicable(field, matchOffsets != null && !matchOffsets.isEmpty()) ? first : second;
|
||||
return delegate.format(field, values, contiguousValue, valueRanges, matchOffsets);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> alwaysFetchedFields() {
|
||||
return fieldUnion;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends a new highlighter to the chain of field highlighters. The order of field highlighters
* is important (the first applicable highlighter wins).
|
||||
*/
|
||||
public MatchHighlighter appendFieldHighlighter(FieldValueHighlighter highlighter) {
|
||||
fieldHighlighters.add(highlighter);
|
||||
fieldsAlwaysReturned.addAll(highlighter.alwaysFetchedFields());
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Always fetch the given set of fields for all input documents.
|
||||
*/
|
||||
public void alwaysFetchFields(String field, String... otherFields) {
|
||||
Stream.concat(Stream.of(field), Stream.of(otherFields))
|
||||
.forEach(fld -> fieldsAlwaysReturned.add(Objects.requireNonNull(fld)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Single document's highlights.
|
||||
*/
|
||||
public static class DocHighlights {
|
||||
public final int docId;
|
||||
public final Map<String, List<String>> fields = new LinkedHashMap<>();
|
||||
|
||||
public DocHighlights(int docId) {
|
||||
this.docId = docId;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An {@link OffsetRange} of a match, together with the source query that caused it.
|
||||
*/
|
||||
public static class QueryOffsetRange extends OffsetRange {
|
||||
public final Query query;
|
||||
|
||||
QueryOffsetRange(Query query, int from, int to) {
|
||||
super(from, to);
|
||||
this.query = query;
|
||||
}
|
||||
}
|
||||
|
||||
private static class DocHit {
|
||||
final int docId;
|
||||
private final LeafReader leafReader;
|
||||
private final int leafDocId;
|
||||
private final LinkedHashMap<String, List<QueryOffsetRange>> matchRanges
|
||||
= new LinkedHashMap<>();
|
||||
|
||||
DocHit(int docId, LeafReader leafReader, int leafDocId) {
|
||||
this.docId = docId;
|
||||
this.leafReader = leafReader;
|
||||
this.leafDocId = leafDocId;
|
||||
}
|
||||
|
||||
void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
|
||||
hits.forEach((field, offsets) -> {
|
||||
List<QueryOffsetRange> target = matchRanges.computeIfAbsent(field, (fld) -> new ArrayList<>());
|
||||
offsets.forEach(o -> target.add(new QueryOffsetRange(query, o.from, o.to)));
|
||||
});
|
||||
}
|
||||
|
||||
Document document(Predicate<String> needsField) throws IOException {
|
||||
// Only load the fields that have matches or that must always be fetched.
|
||||
DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor() {
|
||||
@Override
|
||||
public Status needsField(FieldInfo fieldInfo) {
|
||||
return (matchRanges.containsKey(fieldInfo.name) ||
|
||||
needsField.test(fieldInfo.name)) ? Status.YES : Status.NO;
|
||||
}
|
||||
};
|
||||
|
||||
leafReader.document(leafDocId, visitor);
|
||||
return visitor.getDocument();
|
||||
}
|
||||
}
|
||||
|
||||
public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) {
|
||||
this(searcher, analyzer, MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
|
||||
}
|
||||
|
||||
public MatchHighlighter(IndexSearcher searcher,
|
||||
Analyzer analyzer,
|
||||
OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) {
|
||||
this.searcher = searcher;
|
||||
this.offsetsRetrievalStrategies = offsetsRetrievalStrategies;
|
||||
this.analyzer = analyzer;
|
||||
}
|
||||
|
||||
public Stream<DocHighlights> highlight(TopDocs topDocs, Query... queries) throws IOException {
|
||||
// We want to preserve topDocs document ordering and MatchRegionRetriever is optimized
|
||||
// for streaming, so we'll just prepopulate the map in proper order.
|
||||
LinkedHashMap<Integer, DocHit> docHits = new LinkedHashMap<>();
|
||||
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
|
||||
docHits.put(scoreDoc.doc, null);
|
||||
}
|
||||
|
||||
// Collect match ranges for each query and associate each range to the origin query.
|
||||
for (Query q : queries) {
|
||||
MatchRegionRetriever highlighter =
|
||||
new MatchRegionRetriever(searcher, searcher.rewrite(q), offsetsRetrievalStrategies);
|
||||
highlighter.highlightDocuments(topDocs,
|
||||
(int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits) -> {
|
||||
DocHit docHit = docHits.get(docId);
|
||||
if (docHit == null) {
|
||||
docHit = new DocHit(docId, leafReader, leafDocId);
|
||||
docHits.put(docId, docHit);
|
||||
}
|
||||
docHit.addMatches(q, hits);
|
||||
});
|
||||
}
|
||||
|
||||
return docHits.values().stream()
|
||||
.filter(Objects::nonNull) // This should always be the case?
|
||||
.map(this::computeDocFieldValues);
|
||||
}
|
||||
|
||||
private DocHighlights computeDocFieldValues(DocHit docHit) {
|
||||
Document doc;
|
||||
try {
|
||||
doc = docHit.document(fieldsAlwaysReturned::contains);
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
|
||||
DocHighlights docHighlights = new DocHighlights(docHit.docId);
|
||||
|
||||
HashSet<String> unique = new HashSet<>();
|
||||
for (IndexableField indexableField : doc) {
|
||||
String field = indexableField.name();
|
||||
if (!unique.add(field)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String[] values = doc.getValues(field);
|
||||
String contiguousValue = contiguousFieldValue(field, values);
|
||||
List<OffsetRange> valueRanges = computeValueRanges(field, values);
|
||||
List<QueryOffsetRange> offsets = docHit.matchRanges.get(field);
|
||||
|
||||
List<String> formattedValues = fieldValueHighlighter(field, offsets != null)
|
||||
.format(field, values, contiguousValue, valueRanges, offsets);
|
||||
|
||||
if (formattedValues != null) {
|
||||
docHighlights.fields.put(field, formattedValues);
|
||||
}
|
||||
}
|
||||
|
||||
return docHighlights;
|
||||
}
|
||||
|
||||
private List<OffsetRange> computeValueRanges(String field, String[] values) {
|
||||
ArrayList<OffsetRange> valueRanges = new ArrayList<>();
|
||||
int offset = 0;
|
||||
for (CharSequence v : values) {
|
||||
valueRanges.add(new OffsetRange(offset, offset + v.length()));
|
||||
offset += v.length();
|
||||
offset += analyzer.getOffsetGap(field);
|
||||
}
|
||||
return valueRanges;
|
||||
}
|
||||
|
||||
private String contiguousFieldValue(String field, String[] values) {
|
||||
String value;
|
||||
if (values.length == 1) {
|
||||
value = values[0];
|
||||
} else {
|
||||
// TODO: This can be inefficient if the offset gap is large, but the logic
// of applying offsets would get much more complicated, so we leave it as is for now
// (we would have to recalculate all offsets to omit the gaps).
|
||||
String fieldGapPadding = " ".repeat(analyzer.getOffsetGap(field));
|
||||
value = String.join(fieldGapPadding, values);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
private FieldValueHighlighter fieldValueHighlighter(String field, boolean hasMatches) {
|
||||
for (FieldValueHighlighter highlighter : fieldHighlighters) {
|
||||
if (highlighter.isApplicable(field, hasMatches)) {
|
||||
return highlighter;
|
||||
}
|
||||
}
|
||||
throw new RuntimeException("No field highlighter could be matched to field: " + field);
|
||||
}
|
||||
}
|
|
@@ -80,22 +80,23 @@ public class MatchRegionRetriever {
|
|||
|
||||
/**
|
||||
* A constructor with the default offset strategy supplier.
|
||||
*
|
||||
* @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
|
||||
* in the absence of position offsets in the index. Note that the analyzer must return
|
||||
* tokens (positions and offsets) identical to the ones stored in the index.
|
||||
*/
|
||||
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer) throws IOException {
|
||||
this(searcher, query, analyzer, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
|
||||
this(searcher, query, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param searcher Index searcher to be used for retrieving matches.
|
||||
* @param query The query for which matches should be retrieved. The query should be rewritten
|
||||
* against the provided searcher.
|
||||
* @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
|
||||
* in the absence of position offsets in the index. Note that the analyzer must return
|
||||
* tokens (positions and offsets) identical to the ones stored in the index.
|
||||
* @param fieldOffsetStrategySupplier A custom supplier of per-field {@link OffsetsRetrievalStrategy}
|
||||
* instances.
|
||||
*/
|
||||
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer,
|
||||
public MatchRegionRetriever(IndexSearcher searcher, Query query,
|
||||
OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier)
|
||||
throws IOException {
|
||||
leaves = searcher.getIndexReader().leaves();
|
||||
|
|
|
@@ -0,0 +1,51 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.matchhighlight;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
|
||||
|
||||
/**
|
||||
* An analyzer for tests that has a predefined offset and position gap.
|
||||
*/
|
||||
class AnalyzerWithGaps extends DelegatingAnalyzerWrapper {
|
||||
private final Analyzer delegate;
|
||||
private final int offsetGap;
|
||||
private final int positionGap;
|
||||
|
||||
AnalyzerWithGaps(int offsetGap, int positionGap, Analyzer delegate) {
|
||||
super(delegate.getReuseStrategy());
|
||||
this.delegate = delegate;
|
||||
this.offsetGap = offsetGap;
|
||||
this.positionGap = positionGap;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Analyzer getWrappedAnalyzer(String fieldName) {
|
||||
return delegate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getOffsetGap(String fieldName) {
|
||||
return offsetGap;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getPositionIncrementGap(String fieldName) {
|
||||
return positionGap;
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,105 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.matchhighlight;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.RandomizedTest;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.NumericDocValuesField;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
/**
|
||||
* Utility class for building an ephemeral document index
|
||||
* and running a block of code on its reader.
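* <p>
* A typical usage sketch (the {@code analyzer} and the field-creating function are assumed
* to be supplied by the calling test):
* <pre>{@code
* new IndexBuilder((field, value) -> new TextField(field, value, Field.Store.YES))
*     .doc("text", "foo bar baz")
*     .doc("text", "bar foo baz")
*     .build(analyzer, reader -> {
*       // Run assertions against the short-lived DirectoryReader here.
*     });
* }</pre>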
|
||||
*/
|
||||
class IndexBuilder {
|
||||
public static final String FLD_ID = "id";
|
||||
public static final String FLD_SORT_ORDER = "id_order";
|
||||
|
||||
private final BiFunction<String, String, IndexableField> toField;
|
||||
private final ArrayList<Document> documents = new ArrayList<>();
|
||||
private int seq;
|
||||
|
||||
class DocFields {
|
||||
final Document document;
|
||||
|
||||
public DocFields(Document doc) {
|
||||
this.document = doc;
|
||||
}
|
||||
|
||||
public void add(String field, String... values) {
|
||||
assert values.length > 0 : "At least one field value is required.";
|
||||
for (String value : values) {
|
||||
document.add(toField.apply(field, value));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
IndexBuilder(BiFunction<String, String, IndexableField> valueToField) {
|
||||
this.toField = valueToField;
|
||||
}
|
||||
|
||||
public IndexBuilder doc(String field, String... values) {
|
||||
return doc(fields -> {
|
||||
fields.add(field, values);
|
||||
});
|
||||
}
|
||||
|
||||
public IndexBuilder doc(Consumer<DocFields> fields) {
|
||||
Document doc = new Document();
|
||||
doc.add(new NumericDocValuesField(FLD_SORT_ORDER, seq));
|
||||
doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES));
|
||||
fields.accept(new DocFields(doc));
|
||||
documents.add(doc);
|
||||
return this;
|
||||
}
|
||||
|
||||
public IndexBuilder build(Analyzer analyzer, IOUtils.IOConsumer<DirectoryReader> block) throws IOException {
|
||||
IndexWriterConfig config = new IndexWriterConfig(analyzer);
|
||||
config.setIndexSort(new Sort(new SortField(FLD_SORT_ORDER, SortField.Type.LONG)));
|
||||
try (Directory directory = new ByteBuffersDirectory()) {
|
||||
IndexWriter iw = new IndexWriter(directory, config);
|
||||
for (Document doc : documents) {
|
||||
iw.addDocument(doc);
|
||||
}
|
||||
if (RandomizedTest.randomBoolean()) {
|
||||
iw.commit();
|
||||
}
|
||||
iw.flush();
|
||||
|
||||
try (DirectoryReader reader = DirectoryReader.open(iw)) {
|
||||
block.accept(reader);
|
||||
}
|
||||
}
|
||||
return this;
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,466 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.matchhighlight;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.RandomizedTest;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
|
||||
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.intervals.IntervalQuery;
|
||||
import org.apache.lucene.queries.intervals.Intervals;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.hamcrest.Matchers;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public class TestMatchHighlighter extends LuceneTestCase {
|
||||
private static final String FLD_ID = "id";
|
||||
private static final String FLD_TEXT1 = "text1";
|
||||
private static final String FLD_TEXT2 = "text2";
|
||||
|
||||
private FieldType TYPE_TEXT_POSITIONS_OFFSETS;
|
||||
private FieldType TYPE_TEXT_POSITIONS;
|
||||
|
||||
private PerFieldAnalyzerWrapper analyzer;
|
||||
|
||||
@Before
|
||||
public void setup() throws IOException {
|
||||
TYPE_TEXT_POSITIONS = TextField.TYPE_STORED;
|
||||
|
||||
TYPE_TEXT_POSITIONS_OFFSETS = new FieldType(TextField.TYPE_STORED);
|
||||
TYPE_TEXT_POSITIONS_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
TYPE_TEXT_POSITIONS_OFFSETS.freeze();
|
||||
|
||||
Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
|
||||
|
||||
// Create an analyzer with some synonyms, just to showcase them.
|
||||
SynonymMap synonymMap = buildSynonymMap(new String[][]{
|
||||
{"moon\u0000shine", "firewater"},
|
||||
{"firewater", "moon\u0000shine"},
|
||||
});
|
||||
|
||||
// Use a non-empty offset gap so that the break iterator doesn't go haywire on multiple
// values glued together.
|
||||
final int offsetGap = RandomizedTest.randomIntBetween(1, 2);
|
||||
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
|
||||
Analyzer synonymsAnalyzer =
|
||||
new AnalyzerWithGaps(offsetGap, positionGap, new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
|
||||
return new TokenStreamComponents(tokenizer, tokenStream);
|
||||
}
|
||||
});
|
||||
|
||||
fieldAnalyzers.put(FLD_TEXT1, synonymsAnalyzer);
|
||||
fieldAnalyzers.put(FLD_TEXT2, synonymsAnalyzer);
|
||||
|
||||
analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
|
||||
}
|
||||
|
||||
static SynonymMap buildSynonymMap(String[][] synonyms) throws IOException {
|
||||
SynonymMap.Builder builder = new SynonymMap.Builder();
|
||||
for (String[] pair : synonyms) {
|
||||
assertThat(pair.length, Matchers.equalTo(2));
|
||||
builder.add(new CharsRef(pair[0]), new CharsRef(pair[1]), true);
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBasicUsage() throws IOException {
|
||||
new IndexBuilder(this::toField)
|
||||
.doc(FLD_TEXT1, "foo bar baz")
|
||||
.doc(FLD_TEXT1, "bar foo baz")
|
||||
.doc(fields -> {
|
||||
fields.add(FLD_TEXT1, "Very long content but not matching anything.");
|
||||
fields.add(FLD_TEXT2, "no foo but bar");
|
||||
})
|
||||
.build(analyzer, reader -> {
|
||||
Query query = new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term(FLD_TEXT1, "foo")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term(FLD_TEXT2, "bar")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
||||
// In the most basic scenario, we run a search against a query, retrieve
|
||||
// top docs...
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
|
||||
TopDocs topDocs = searcher.search(query, 10, sortOrder);
|
||||
|
||||
// ...and would want a fixed set of fields from those documents, some of them
|
||||
// possibly highlighted if they matched the query.
|
||||
//
|
||||
// This configures the highlighter so that the FLD_ID field is always returned verbatim,
|
||||
// and FLD_TEXT1 is returned *only if it contained a query match*.
|
||||
MatchHighlighter highlighter =
|
||||
new MatchHighlighter(searcher, analyzer)
|
||||
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
|
||||
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
|
||||
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
|
||||
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
|
||||
|
||||
// Note that document field highlights are a stream over the documents in topDocs. In the remaining
// code we just collect them on the fly into preformatted strings.
|
||||
Stream<MatchHighlighter.DocHighlights> highlights = highlighter.highlight(topDocs, query);
|
||||
assertHighlights(toDocList(highlights),
|
||||
" 0. id: 0",
|
||||
" text1: >foo< bar baz",
|
||||
" 1. id: 1",
|
||||
" text1: bar >foo< baz",
|
||||
" 2. id: 2");
|
||||
|
||||
// In a more realistic use case, you'd want to show the value of a given field *regardless* of whether it
|
||||
// contained a highlight or not -- it is odd that document "id: 2" above doesn't have the 'text1' field
|
||||
// shown because that field wasn't part of the query match.
|
||||
//
|
||||
// Let's say the field is also potentially long; if it contains a match,
|
||||
// we would want to display the contextual snippet surrounding that match. If it does not contain any
|
||||
// matches, we would want to display its content up to a given number of characters (lead lines).
|
||||
//
|
||||
// Let's do this by adding an appropriate field highlighter on FLD_TEXT1.
|
||||
highlighter =
|
||||
new MatchHighlighter(searcher, analyzer)
|
||||
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
|
||||
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
|
||||
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
|
||||
.appendFieldHighlighter(FieldValueHighlighters.maxLeadingCharacters(10, "...", Set.of(FLD_TEXT1)))
|
||||
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
|
||||
|
||||
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
|
||||
" 0. id: 0",
|
||||
" text1: >foo< bar baz",
|
||||
" 1. id: 1",
|
||||
" text1: bar >foo< baz",
|
||||
" 2. id: 2",
|
||||
" text1: Very long...");
|
||||
|
||||
// Field highlighters can apply to multiple fields and be chained for convenience.
|
||||
// For example, this defines a combined highlighter over both FLD_TEXT1 and FLD_TEXT2.
|
||||
Set<String> fields = Set.of(FLD_TEXT1, FLD_TEXT2);
|
||||
MatchHighlighter.FieldValueHighlighter highlightedOrAbbreviated =
|
||||
FieldValueHighlighters.highlighted(80 * 3, 1, new PassageFormatter("...", ">", "<"), fields::contains)
|
||||
.or(FieldValueHighlighters.maxLeadingCharacters(10, "...", fields));
|
||||
|
||||
highlighter =
|
||||
new MatchHighlighter(searcher, analyzer)
|
||||
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
|
||||
.appendFieldHighlighter(highlightedOrAbbreviated)
|
||||
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
|
||||
|
||||
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
|
||||
" 0. id: 0",
|
||||
" text1: >foo< bar baz",
|
||||
" 1. id: 1",
|
||||
" text1: bar >foo< baz",
|
||||
" 2. id: 2",
|
||||
" text1: Very long...",
|
||||
" text2: no foo but >bar<");
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSynonymHighlight() throws IOException {
|
||||
// There is nothing special needed to highlight or process complex queries, synonyms, etc.
// The synonyms are defined in this class's setup method.
|
||||
new IndexBuilder(this::toField)
|
||||
.doc(FLD_TEXT1, "Where the moon shine falls, firewater flows.")
|
||||
.build(analyzer, reader -> {
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
|
||||
|
||||
MatchHighlighter highlighter =
|
||||
new MatchHighlighter(searcher, analyzer)
|
||||
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
|
||||
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
|
||||
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
|
||||
|
||||
Query query = new TermQuery(new Term(FLD_TEXT1, "firewater"));
|
||||
assertHighlights(toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
|
||||
"0. text1: Where the >moon shine< falls, >firewater< flows.");
|
||||
|
||||
query = new PhraseQuery(FLD_TEXT1, "moon", "shine");
|
||||
assertHighlights(toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
|
||||
"0. text1: Where the >moon shine< falls, >firewater< flows.");
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCustomFieldHighlightHandling() throws IOException {
|
||||
// The match highlighter is a showcase of the individual components in this package, suitable
// for creating any kind of field-display design.
|
||||
//
|
||||
// In this example we will build a custom field highlighting handler that
|
||||
// highlights matches over a multivalued field, shows that field's values if it received
|
||||
// no matches and limits the number of values displayed to at most 2 (with an appropriate message).
|
||||
new IndexBuilder(this::toField)
|
||||
// Just one document, one field, four values.
|
||||
.doc(FLD_TEXT1, "foo bar", "bar foo baz", "bar baz foo", "baz baz baz")
|
||||
.build(analyzer, reader -> {
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
Sort sortOrder = Sort.INDEXORDER;
|
||||
|
||||
// Let's start with the simple predefined highlighter so that the field's value is shown
// and highlighted when it was part of the hit.
|
||||
MatchHighlighter.FieldValueHighlighter highlighted = FieldValueHighlighters.highlighted(
|
||||
80 * 3, 2, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals);
|
||||
MatchHighlighter highlighter =
|
||||
new MatchHighlighter(searcher, analyzer)
|
||||
.appendFieldHighlighter(highlighted)
|
||||
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
|
||||
|
||||
Query query = new TermQuery(new Term(FLD_TEXT1, "foo"));
|
||||
TopDocs topDocs = searcher.search(query, 10, sortOrder);
|
||||
|
||||
// Note the highlighter is configured with at most 2 snippets, so the match on the
// third value ("bar baz foo") is omitted. No ellipsis is inserted either, because
// the values are displayed in full.
|
||||
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
|
||||
"0. text1: >foo< bar, bar >foo< baz");
|
||||
|
||||
// So the above works fine if the field received a match, but omits it otherwise. We can
// force the display of this field by chaining it with the verbatim value highlighter:
|
||||
highlighter =
|
||||
new MatchHighlighter(searcher, analyzer)
|
||||
.appendFieldHighlighter(highlighted.or(FieldValueHighlighters.verbatimValue(FLD_TEXT1)))
|
||||
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
|
||||
|
||||
assertHighlights(toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
|
||||
"0. text1: foo bar, bar foo baz, bar baz foo, baz baz baz");
|
||||
|
||||
// But this is not exactly what we'd like because we want to limit the display of values to the first two.
|
||||
// Let's just write a custom field highlighter handler that does it.
|
||||
class AtMostNValuesHighlighter implements MatchHighlighter.FieldValueHighlighter {
|
||||
private final String field;
|
||||
private final int limit;
|
||||
|
||||
AtMostNValuesHighlighter(String field, int limit) {
|
||||
this.field = field;
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isApplicable(String field, boolean hasMatches) {
|
||||
return Objects.equals(field, this.field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> format(String field, String[] values, String contiguousValue,
|
||||
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
|
||||
if (values.length <= limit) {
|
||||
return Arrays.asList(values);
|
||||
} else {
|
||||
List<String> collected = Stream.of(values).limit(limit).collect(Collectors.toList());
|
||||
int remaining = values.length - collected.size();
|
||||
collected.add(String.format(Locale.ROOT, "[%d omitted]", remaining));
|
||||
return collected;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> alwaysFetchedFields() {
|
||||
return Collections.singleton(field);
|
||||
}
|
||||
}
|
||||
|
||||
// We can now chain it as usual and contemplate the result.
|
||||
highlighter =
|
||||
new MatchHighlighter(searcher, analyzer)
|
||||
.appendFieldHighlighter(highlighted.or(new AtMostNValuesHighlighter(FLD_TEXT1, 2)))
|
||||
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
|
||||
|
||||
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
|
||||
"0. text1: >foo< bar, bar >foo< baz");
|
||||
assertHighlights(toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
|
||||
"0. text1: foo bar, bar foo baz, [2 omitted]");
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHighlightMoreQueriesAtOnceShowoff() throws IOException {
|
||||
// The match highlighter's underlying components are powerful enough to build interesting,
// if not always super-practical, things. In this case, we would like to highlight
// the matches of *more than one* query over the same set of input documents. This includes
// highest-scoring passage resolution (from multiple hits) and different highlight markers
// for each query.
|
||||
new IndexBuilder(this::toField)
|
||||
.doc(FLD_TEXT1, "foo bar baz")
|
||||
.doc(FLD_TEXT1, "foo baz bar")
|
||||
.build(analyzer, reader -> {
|
||||
// Let's start with the two queries. The first one will be an unordered
|
||||
// query for (foo, baz) with a max gap of 1; let's use intervals for this.
|
||||
Query q1 = new IntervalQuery(FLD_TEXT1,
|
||||
Intervals.maxgaps(1,
|
||||
Intervals.unordered(
|
||||
Intervals.term("foo"),
|
||||
Intervals.term("baz"))));
|
||||
|
||||
// The second one will be a simpler term query for "bar".
|
||||
Query q2 = new TermQuery(new Term(FLD_TEXT1, "bar"));
|
||||
|
||||
// Let's fetch matching documents by combining the two into a Boolean query.
|
||||
Query query = new BooleanQuery.Builder()
|
||||
.add(q1, BooleanClause.Occur.SHOULD)
|
||||
.add(q2, BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
|
||||
TopDocs topDocs = searcher.search(query, 10, sortOrder);
|
||||
|
||||
// If we use the "regular" highlighter, the result will be slightly odd: a nested
|
||||
// highlight over "bar" within the first match. Also, you can't distinguish which of the sub-queries
|
||||
// caused which highlight marker... but if it were HTML then you could give the span
|
||||
// some semi-translucent background and layered matches would be visible.
|
||||
MatchHighlighter highlighter =
|
||||
new MatchHighlighter(searcher, analyzer)
|
||||
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
|
||||
80 * 3, 1, new PassageFormatter("...", "<span>", "</span>"), FLD_TEXT1::equals))
|
||||
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
|
||||
|
||||
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
|
||||
"0. text1: <span>foo <span>bar</span> baz</span>",
|
||||
"1. text1: <span>foo baz</span> <span>bar</span>");
|
||||
|
||||
// To separate highlights for multiple queries we'll pass them to the highlighter
// individually and differentiate the highlight markers as they are applied. Let's start with the
// customized field highlighter. This utilizes the fact that the match ranges passed from
// MatchHighlighter contain a reference to the original query that produced the match.
|
||||
class SeparateMarkerFieldHighlighter implements MatchHighlighter.FieldValueHighlighter {
|
||||
private final String field;
|
||||
private final Map<Query, String> queryClassMap;
|
||||
|
||||
SeparateMarkerFieldHighlighter(String field, Map<Query, String> queryClassMap) {
|
||||
this.field = field;
|
||||
this.queryClassMap = queryClassMap;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isApplicable(String field, boolean hasMatches) {
|
||||
return Objects.equals(field, this.field) && hasMatches;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> format(String field, String[] values, String contiguousValue,
|
||||
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
|
||||
PassageSelector passageSelector = new PassageSelector();
|
||||
int maxPassageWindow = 80;
|
||||
int maxPassages = 3;
|
||||
List<Passage> bestPassages =
|
||||
passageSelector.pickBest(contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges);
|
||||
|
||||
// We know the offset ranges passed to us by MatchHighlighter are instances of QueryOffsetRange
|
||||
// so we compute the class based on that.
|
||||
Function<OffsetRange, String> queryToClass =
|
||||
(range) -> queryClassMap.get(((MatchHighlighter.QueryOffsetRange) range).query);
|
||||
|
||||
PassageFormatter passageFormatter = new PassageFormatter("...",
|
||||
(range) -> "<span class='" + queryToClass.apply(range) + "'>",
|
||||
(range) -> "</span>");
|
||||
|
||||
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
|
||||
}
|
||||
}
|
||||
|
||||
// And this is pretty much it. We now set up query classes to display, set up the highlighter...
|
||||
Map<Query, String> queryClassMap = Map.of(q1, "q1", q2, "q2");
|
||||
highlighter =
|
||||
new MatchHighlighter(searcher, analyzer)
|
||||
.appendFieldHighlighter(new SeparateMarkerFieldHighlighter(FLD_TEXT1, queryClassMap))
|
||||
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
|
||||
|
||||
// ...and run highlighting. Note that the queries passed to the highlighter are the individual
// sub-clauses of the Boolean query used to fetch documents.
|
||||
assertHighlights(toDocList(highlighter.highlight(topDocs, q1, q2)),
|
||||
"0. text1: <span class='q1'>foo <span class='q2'>bar</span> baz</span>",
|
||||
"1. text1: <span class='q1'>foo baz</span> <span class='q2'>bar</span>");
|
||||
});
|
||||
}
|
||||
|
||||
private void assertHighlights(List<List<String>> docList, String... expectedFormattedLines) {
|
||||
ArrayList<String> actualLines = new ArrayList<>();
|
||||
for (int doc = 0; doc < docList.size(); doc++) {
|
||||
List<String> fields = docList.get(doc);
|
||||
for (int i = 0; i < fields.size(); i++) {
|
||||
actualLines.add((i == 0 ? String.format(Locale.ROOT, "%2d. ", doc) : " ") + fields.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
if (!Arrays.equals(
|
||||
Stream.of(expectedFormattedLines).map(String::trim).toArray(),
|
||||
actualLines.stream().map(String::trim).toArray())) {
|
||||
throw new AssertionError("Actual hits were:\n" +
|
||||
String.join("\n", actualLines) + "\n\n but expected them to be:\n" +
|
||||
String.join("\n", expectedFormattedLines));
|
||||
}
|
||||
}
|
||||
|
||||
private List<List<String>> toDocList(Stream<MatchHighlighter.DocHighlights> highlights) {
|
||||
return highlights.map(docHighlights ->
|
||||
docHighlights.fields.entrySet().stream()
|
||||
.map(e -> e.getKey() + ": " + String.join(", ", e.getValue()))
|
||||
.collect(Collectors.toList())
|
||||
).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private IndexableField toField(String name, String value) {
|
||||
switch (name) {
|
||||
case FLD_TEXT1:
|
||||
return new Field(name, value, TYPE_TEXT_POSITIONS_OFFSETS);
|
||||
case FLD_TEXT2:
|
||||
return new Field(name, value, TYPE_TEXT_POSITIONS);
|
||||
default:
|
||||
throw new AssertionError("Don't know how to handle this field: " + name);
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -20,21 +20,17 @@ import com.carrotsearch.randomizedtesting.RandomizedTest;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
|
||||
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.apache.lucene.analysis.util.CharTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.intervals.IntervalQuery;
|
||||
|
@@ -52,19 +48,13 @@ import org.apache.lucene.search.TermQuery;
|
|||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.hamcrest.Matchers;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
@@ -75,11 +65,9 @@ import java.util.stream.Collectors;
|
|||
import java.util.stream.Stream;
|
||||
|
||||
import static org.hamcrest.Matchers.containsInAnyOrder;
|
||||
import static org.hamcrest.Matchers.emptyArray;
|
||||
import static org.hamcrest.Matchers.not;
|
||||
|
||||
public class TestMatchRegionRetriever extends LuceneTestCase {
|
||||
private static final String FLD_ID = "field_id";
|
||||
private static final String FLD_ID = IndexBuilder.FLD_ID;
|
||||
|
||||
private static final String FLD_TEXT_POS_OFFS1 = "field_text_offs1";
|
||||
private static final String FLD_TEXT_POS_OFFS2 = "field_text_offs2";
|
||||
|
@@ -100,7 +88,7 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
|||
private Analyzer analyzer;
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
public void setup() throws IOException {
|
||||
TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED);
|
||||
TYPE_STORED_WITH_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
TYPE_STORED_WITH_OFFSETS.freeze();
|
||||
|
@@ -109,26 +97,24 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
|||
TYPE_STORED_NO_POSITIONS.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||
TYPE_STORED_NO_POSITIONS.freeze();
|
||||
|
||||
final int offsetGap = RandomizedTest.randomIntBetween(0, 2);
|
||||
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
|
||||
Analyzer whitespaceAnalyzer =
|
||||
new Analyzer() {
|
||||
final int offsetGap = RandomizedTest.randomIntBetween(0, 2);
|
||||
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
|
||||
new AnalyzerWithGaps(offsetGap, positionGap,
|
||||
new WhitespaceAnalyzer(WhitespaceTokenizer.DEFAULT_MAX_WORD_LEN));
|
||||
|
||||
SynonymMap synonymMap = TestMatchHighlighter.buildSynonymMap(new String[][] {
|
||||
{"foo\u0000bar", "syn1"},
|
||||
{"baz", "syn2\u0000syn3"},
|
||||
});
|
||||
|
||||
Analyzer synonymsAnalyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
WhitespaceTokenizer tokenizer =
|
||||
new WhitespaceTokenizer(CharTokenizer.DEFAULT_MAX_WORD_LEN);
|
||||
return new TokenStreamComponents(tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getOffsetGap(String fieldName) {
|
||||
return offsetGap;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getPositionIncrementGap(String fieldName) {
|
||||
return positionGap;
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
|
||||
return new TokenStreamComponents(tokenizer, tokenStream);
|
||||
}
|
||||
};
|
||||
|
||||
|
@@ -138,26 +124,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
|||
fieldAnalyzers.put(FLD_TEXT_POS_OFFS1, whitespaceAnalyzer);
|
||||
fieldAnalyzers.put(FLD_TEXT_POS_OFFS2, whitespaceAnalyzer);
|
||||
fieldAnalyzers.put(FLD_TEXT_NOPOS, whitespaceAnalyzer);
|
||||
|
||||
try {
|
||||
SynonymMap.Builder b = new SynonymMap.Builder();
|
||||
b.add(new CharsRef("foo\u0000bar"), new CharsRef("syn1"), true);
|
||||
b.add(new CharsRef("baz"), new CharsRef("syn2\u0000syn3"), true);
|
||||
SynonymMap synonymMap = b.build();
|
||||
Analyzer synonymsAnalyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
|
||||
return new TokenStreamComponents(tokenizer, tokenStream);
|
||||
}
|
||||
};
|
||||
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer);
|
||||
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer);
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer);
|
||||
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer);
|
||||
|
||||
analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
|
||||
}
|
||||
|
@@ -184,13 +152,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
|||
}
|
||||
|
||||
private void checkTermQuery(String field) throws IOException {
|
||||
withReader(
|
||||
List.of(
|
||||
Map.of(field, values("foo bar baz")),
|
||||
Map.of(field, values("bar foo baz")),
|
||||
Map.of(field, values("bar baz foo")),
|
||||
Map.of(field, values("bar bar bar irrelevant"))),
|
||||
reader -> {
|
||||
new IndexBuilder(this::toField)
|
||||
.doc(field, "foo bar baz")
|
||||
.doc(field, "bar foo baz")
|
||||
.doc(field, "bar baz foo")
|
||||
.doc(field, "bar bar bar irrelevant")
|
||||
.build(analyzer, reader -> {
|
||||
assertThat(highlights(reader, new TermQuery(new Term(field, "foo"))),
|
||||
containsInAnyOrder(
|
||||
fmt("0: (%s: '>foo< bar baz')", field),
|
||||
|
@@ -217,17 +184,17 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
|||
.add(new TermQuery(new Term(field, "xyz")), BooleanClause.Occur.MUST_NOT)
|
||||
.build();
|
||||
|
||||
withReader(
|
||||
List.of(
|
||||
Map.of(field, values("foo bar baz abc")),
|
||||
Map.of(field, values("bar foo baz def")),
|
||||
Map.of(field, values("bar baz foo xyz"))),
|
||||
reader -> {
|
||||
new IndexBuilder(this::toField)
|
||||
.doc(field, "foo bar baz abc")
|
||||
.doc(field, "bar foo baz def")
|
||||
.doc(field, "bar baz foo xyz")
|
||||
.build(analyzer, reader -> {
|
||||
assertThat(highlights(reader, query),
|
||||
containsInAnyOrder(
|
||||
fmt("0: (%s: '>foo bar baz< abc')", field),
|
||||
fmt("1: (%s: 'bar >foo baz< def')", field)));
|
||||
});
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@@ -241,12 +208,11 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}

private void checkVariousQueryTypes(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz abc")),
Map.of(field, values("bar foo baz def")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz abc")
.doc(field, "bar foo baz def")
.doc(field, "bar baz foo xyz")
.build(analyzer, reader -> {
assertThat(highlights(reader, stdQueryParser.apply("foo baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar >baz< abc')", field),

@@ -297,31 +263,31 @@ public class TestMatchRegionRetriever extends LuceneTestCase {

assertThat(highlights(reader, new MatchAllDocsQuery()),
Matchers.hasSize(0));
});
}
);

withReader(
List.of(
Map.of(field, values("foo baz foo")),
Map.of(field, values("bas baz foo")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo baz foo")
.doc(field, "bas baz foo")
.doc(field, "bar baz foo xyz")
.build(analyzer, reader -> {
assertThat(
highlights(reader, stdQueryParser.apply("[bar TO baz] -bar", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >baz< foo')", field), fmt("1: (%s: '>bas< >baz< foo')", field)));
});
}
);
}

@Test
public void testIntervalQueries() throws IOException {
String field = FLD_TEXT_POS_OFFS;

withReader(
List.of(
Map.of(field, values("foo baz foo")),
Map.of(field, values("bas baz foo")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo baz foo")
.doc(field, "bas baz foo")
.doc(field, "bar baz foo xyz")
.build(analyzer, reader -> {
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.unordered(

@@ -374,7 +340,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
});
}
);
}

@Test

@@ -388,36 +355,37 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}

public void checkMultivaluedFields(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar", "baz abc", "bad baz")),
Map.of(field, values("bar foo", "baz def")),
Map.of(field, values("bar baz", "foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar", "baz abc", "bad baz")
.doc(field, "bar foo", "baz def")
.doc(field, "bar baz", "foo xyz")
.build(analyzer, reader -> {
assertThat(highlights(reader, stdQueryParser.apply("baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>baz< abc | bad >baz<')", field),
fmt("1: (%s: '>baz< def')", field),
fmt("2: (%s: 'bar >baz<')", field)));
});
}
);
}

@Test
public void testMultiFieldHighlights() throws IOException {
for (String[] fields :
for (String[] fieldPairs :
new String[][]{
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS}
}) {
String field1 = fields[0];
String field2 = fields[1];
withReader(
List.of(
Map.of(
field1, values("foo bar", "baz abc"),
field2, values("foo baz", "loo bar"))),
reader -> {
String field1 = fieldPairs[0];
String field2 = fieldPairs[1];

new IndexBuilder(this::toField)
.doc(fields -> {
fields.add(field1, "foo bar", "baz abc");
fields.add(field2, "foo baz", "loo bar");
})
.build(analyzer, reader -> {
String ordered =
Stream.of(fmt("(%s: '>baz< abc')", field1), fmt("(%s: 'loo >bar<')", field2))
.sorted()

@@ -428,7 +396,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
reader,
stdQueryParser.apply(field1 + ":baz" + " OR " + field2 + ":bar", field1)),
containsInAnyOrder(fmt("0: %s", ordered)));
});
}
);
}
}

@@ -440,15 +409,17 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testNoRewrite() throws IOException {
String field1 = FLD_TEXT_POS_OFFS1;
String field2 = FLD_TEXT_POS_OFFS2;
withReader(
List.of(
Map.of(
field1, values("0100"),
field2, values("loo bar")),
Map.of(
field1, values("0200"),
field2, values("foo bar"))),
reader -> {

new IndexBuilder(this::toField)
.doc(fields -> {
fields.add(field1, "0100");
fields.add(field2, "loo bar");
})
.doc(fields -> {
fields.add(field1, "0200");
fields.add(field2, "foo bar");
})
.build(analyzer, reader -> {
String expected = fmt("0: (%s: '>0100<')(%s: 'loo >bar<')", field1, field2);
assertThat(
highlights(

@@ -461,7 +432,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
reader,
stdQueryParser.apply(fmt("+%s:01* AND %s:bar", field1, field2), field1)),
containsInAnyOrder(expected));
});
}
);
}

@Test

@@ -475,9 +447,9 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}

public void checkNestedQueryHits(String field) throws IOException {
withReader(
List.of(Map.of(field, values("foo bar baz abc"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz abc")
.build(analyzer, reader -> {
assertThat(
highlights(
reader,

@@ -496,7 +468,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
.add(new TermQuery(new Term(field, "baz")), BooleanClause.Occur.SHOULD)
.build()),
containsInAnyOrder(fmt("0: (%s: '>foo >bar< >baz<< abc')", field)));
});
}
);
}

@Test

@@ -510,13 +483,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}

private void checkGraphQuery(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz")
.doc(field, "bar foo baz")
.doc(field, "bar baz foo")
.doc(field, "bar bar bar irrelevant")
.build(analyzer, reader -> {
assertThat(highlights(reader, new TermQuery(new Term(field, "syn1"))),
containsInAnyOrder(fmt("0: (%s: '>foo bar< baz')", field)));

@@ -536,7 +508,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
assertThat(
highlights(reader, stdQueryParser.apply(field + ":\"foo syn2 syn3\"", field)),
containsInAnyOrder(fmt("1: (%s: 'bar >foo baz<')", field)));
});
}
);
}

@Test

@@ -550,13 +523,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}

private void checkSpanQueries(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz")
.doc(field, "bar foo baz")
.doc(field, "bar baz foo")
.doc(field, "bar bar bar irrelevant")
.build(analyzer, reader -> {
assertThat(
highlights(
reader,

@@ -598,7 +570,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
fmt("0: (%s: '>foo bar< baz')", field),
fmt("1: (%s: '>bar foo< baz')", field),
fmt("2: (%s: '>bar baz foo<')", field)));
});
}
);
}

/**

@@ -610,12 +583,10 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testTextFieldNoPositionsOffsetFromValues() throws Exception {
String field = FLD_TEXT_NOPOS;

withReader(
List.of(
Map.of(FLD_TEXT_NOPOS, values("foo bar")),
Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz baz"))
),
reader -> {
new IndexBuilder(this::toField)
.doc(FLD_TEXT_NOPOS, "foo bar")
.doc(FLD_TEXT_NOPOS, "foo bar", "baz baz")
.build(analyzer, reader -> {
OffsetsRetrievalStrategySupplier defaults = MatchRegionRetriever
.computeOffsetRetrievalStrategies(reader, analyzer);
OffsetsRetrievalStrategySupplier customSuppliers = (fld) -> {

@@ -634,7 +605,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("0: (%s: '>foo bar<')", field),
fmt("1: (%s: '>foo bar< | >baz baz<')", field)));
});
}
);
}

/**

@@ -648,13 +620,13 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testTextFieldNoPositionsOffsetsFromTokens() throws Exception {
String field = FLD_TEXT_NOPOS;

withReader(
List.of(
Map.of(FLD_TEXT_NOPOS, values("foo bar"),
FLD_TEXT_POS, values("bar bar")),
Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz bar"))
),
reader -> {
new IndexBuilder(this::toField)
.doc(fields -> {
fields.add(FLD_TEXT_NOPOS, "foo bar");
fields.add(FLD_TEXT_POS, "bar bar");
})
.doc(FLD_TEXT_NOPOS, "foo bar", "baz bar")
.build(analyzer, reader -> {
assertThat(
highlights(
reader,

@@ -662,7 +634,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("0: (%s: 'foo >bar<')", field),
fmt("1: (%s: 'foo >bar< | baz >bar<')", field)));
});
}
);
}

private List<String> highlights(IndexReader reader, Query query) throws IOException {

@@ -702,46 +675,14 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
};

MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, analyzer,
offsetsStrategySupplier);
MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, offsetsStrategySupplier);
highlighter.highlightDocuments(topDocs, highlightCollector);

return highlights;
}

private String[] values(String... values) {
assertThat(values, not(emptyArray()));
return values;
}

private void withReader(
Collection<Map<String, String[]>> docs, IOUtils.IOConsumer<DirectoryReader> block)
throws IOException {
IndexWriterConfig config = new IndexWriterConfig(analyzer);

try (Directory directory = new ByteBuffersDirectory()) {
IndexWriter iw = new IndexWriter(directory, config);

int seq = 0;
for (Map<String, String[]> fields : docs) {
Document doc = new Document();
doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES));
for (Map.Entry<String, String[]> field : fields.entrySet()) {
for (String value : field.getValue()) {
doc.add(toField(field.getKey(), value));
}
}
iw.addDocument(doc);
if (RandomizedTest.randomBoolean()) {
iw.commit();
}
}
iw.flush();

try (DirectoryReader reader = DirectoryReader.open(iw)) {
block.accept(reader);
}
}
private static String fmt(String string, Object... args) {
return String.format(Locale.ROOT, string, args);
}

private IndexableField toField(String name, String value) {

@@ -760,8 +701,4 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
throw new AssertionError("Don't know how to handle this field: " + name);
}
}

private static String fmt(String string, Object... args) {
return String.format(Locale.ROOT, string, args);
}
}