LUCENE-9463: Query match region retrieval component, passage scoring and formatting (#1750)

Reviewed by @romseygeek as part of a previous issue.
Dawid Weiss 2020-08-14 14:21:12 +02:00 committed by GitHub
parent a003f64649
commit 150a8dacb5
22 changed files with 2758 additions and 0 deletions

View File

@@ -63,6 +63,9 @@ API Changes
Improvements
* LUCENE-9463: Query match region retrieval component, passage scoring and formatting
for building custom highlighters. (Alan Woodward, Dawid Weiss)
* LUCENE-9370: RegExp query is no longer lenient about inappropriate backslashes and
follows the Java Pattern policy for rejecting illegal syntax. (Mark Harwood)

View File

@@ -28,4 +28,5 @@ dependencies {
testImplementation project(':lucene:test-framework')
testImplementation project(':lucene:analysis:common')
testImplementation project(':lucene:queryparser')
}

View File

@@ -38,6 +38,7 @@
<pathelement path="${memory.jar}"/>
<pathelement path="${queries.jar}"/>
<pathelement path="${analyzers-common.jar}"/>
<pathelement path="${queryparser.jar}"/>
<path refid="test.base.classpath"/>
</path>

View File

@@ -0,0 +1,86 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.text.BreakIterator;
import java.util.Locale;
/**
* A {@link PassageAdjuster} that adjusts the {@link Passage} range to
* word boundaries hinted by the given {@link BreakIterator}.
*/
public class BreakIteratorShrinkingAdjuster implements PassageAdjuster {
private final BreakIterator bi;
private CharSequence value;
public BreakIteratorShrinkingAdjuster() {
this(BreakIterator.getWordInstance(Locale.ROOT));
}
public BreakIteratorShrinkingAdjuster(BreakIterator bi) {
this.bi = bi;
}
@Override
public void currentValue(CharSequence value) {
this.value = value;
bi.setText(new CharSequenceIterator(value));
}
@Override
public OffsetRange adjust(Passage passage) {
int from = passage.from;
if (from > 0) {
while (!bi.isBoundary(from)
|| (from < value.length() && Character.isWhitespace(value.charAt(from)))) {
from = bi.following(from);
if (from == BreakIterator.DONE) {
from = passage.from;
break;
}
}
if (from == value.length()) {
from = passage.from;
}
}
int to = passage.to;
if (to != value.length()) {
while (!bi.isBoundary(to) || (to > 0 && Character.isWhitespace(value.charAt(to - 1)))) {
to = bi.preceding(to);
if (to == BreakIterator.DONE) {
to = passage.to;
break;
}
}
if (to == 0) {
to = passage.to;
}
}
for (OffsetRange r : passage.markers) {
from = Math.min(from, r.from);
to = Math.max(to, r.to);
}
if (from > to) {
from = to;
}
return new OffsetRange(from, to);
}
}
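A minimal usage sketch of the adjuster (hypothetical, not part of this commit); the sample value and offsets are made up for illustration:

// A passage that starts inside "quick" and ends inside "foxes"; no markers.
PassageAdjuster adjuster = new BreakIteratorShrinkingAdjuster();
adjuster.currentValue("quick brown foxes");
OffsetRange adjusted = adjuster.adjust(new Passage(2, 14, java.util.Collections.emptyList()));
// adjusted covers "brown" (offsets 6-11): both edges are shrunk inward to word boundaries.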

View File

@@ -0,0 +1,104 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.text.CharacterIterator;
/**
* A {@link CharacterIterator} over a {@link CharSequence}.
*/
final class CharSequenceIterator implements CharacterIterator {
private final CharSequence text;
private int begin;
private int end;
private int pos;
public CharSequenceIterator(CharSequence text) {
this.text = text;
this.begin = 0;
this.end = text.length();
}
public char first() {
pos = begin;
return current();
}
public char last() {
if (end != begin) {
pos = end - 1;
} else {
pos = end;
}
return current();
}
public char setIndex(int p) {
if (p < begin || p > end) throw new IllegalArgumentException("Invalid index");
pos = p;
return current();
}
public char current() {
if (pos >= begin && pos < end) {
return text.charAt(pos);
} else {
return DONE;
}
}
public char next() {
if (pos < end - 1) {
pos++;
return text.charAt(pos);
} else {
pos = end;
return DONE;
}
}
public char previous() {
if (pos > begin) {
pos--;
return text.charAt(pos);
} else {
return DONE;
}
}
public int getBeginIndex() {
return begin;
}
public int getEndIndex() {
return end;
}
public int getIndex() {
return pos;
}
@Override
public Object clone() {
try {
return super.clone();
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
}
}
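This adapter lets a BreakIterator scan any CharSequence without first copying it into a String; a short sketch (not part of this commit):

BreakIterator words = BreakIterator.getWordInstance(Locale.ROOT);
words.setText(new CharSequenceIterator("foo bar")); // any CharSequence, no copy
int boundary = words.following(0); // -> 3, the boundary after "foo"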

View File

@@ -0,0 +1,304 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.PrimitiveIterator;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Predicate;
/**
* Utility class to compute a list of "match regions" for a given query, searcher and
* document(s) using the {@link Matches} API.
*/
public class MatchRegionRetriever {
private final List<LeafReaderContext> leaves;
private final Weight weight;
private final TreeSet<String> affectedFields;
private final Map<String, OffsetsRetrievalStrategy> offsetStrategies;
private final Set<String> preloadFields;
/**
* A callback for accepting a single document (and its associated leaf reader, leaf document ID)
* and its match offset ranges, as indicated by the {@link Matches} interface retrieved for
* the query.
*/
@FunctionalInterface
public interface MatchOffsetsConsumer {
void accept(int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits)
throws IOException;
}
/**
* An abstraction that provides document values for a given field. The default implementation,
* {@link DocumentFieldValueProvider}, simply reads from a preloaded {@link Document}. A more
* efficient implementation could work on top of a reusable character buffer
* (reusing the buffer while retrieving hit regions for consecutive documents).
*/
@FunctionalInterface
public interface FieldValueProvider {
List<CharSequence> getValues(String field);
}
/**
* A constructor with the default offset strategy supplier.
*/
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer) throws IOException {
this(searcher, query, analyzer, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
}
/**
* @param searcher Index searcher to be used for retrieving matches.
* @param query The query for which matches should be retrieved. The query should be rewritten
* against the provided searcher.
* @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
* in the absence of position offsets in the index. Note that the analyzer must return
* tokens (positions and offsets) identical to the ones stored in the index.
* @param fieldOffsetStrategySupplier A custom supplier of per-field {@link OffsetsRetrievalStrategy}
* instances.
*/
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer,
OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier)
throws IOException {
leaves = searcher.getIndexReader().leaves();
assert checkOrderConsistency(leaves);
// We need full scoring mode so that we can receive matches from all sub-clauses
// (no optimizations in Boolean queries take place).
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 0);
// Compute the subset of fields affected by this query so that we don't load or scan
// fields that are irrelevant.
affectedFields = new TreeSet<>();
query.visit(
new QueryVisitor() {
@Override
public boolean acceptField(String field) {
affectedFields.add(field);
return false;
}
});
// Compute value offset retrieval strategy for all affected fields.
offsetStrategies = new HashMap<>();
for (String field : affectedFields) {
offsetStrategies.put(field, fieldOffsetStrategySupplier.apply(field));
}
// Ask offset strategies if they'll need field values.
preloadFields = new HashSet<>();
offsetStrategies.forEach(
(field, strategy) -> {
if (strategy.requiresDocument()) {
preloadFields.add(field);
}
});
// Only preload those field values that can be affected by the query and are required
// by strategies.
preloadFields.retainAll(affectedFields);
}
public void highlightDocuments(TopDocs topDocs, MatchOffsetsConsumer consumer) throws IOException {
highlightDocuments(Arrays.stream(topDocs.scoreDocs)
.mapToInt(scoreDoc -> scoreDoc.doc)
.sorted()
.iterator(), consumer);
}
/**
* Low-level, high-efficiency method for highlighting large numbers of documents at once in a
* streaming fashion.
*
* @param docIds A stream of <em>sorted</em> document identifiers for which hit ranges should
* be returned.
* @param consumer A streaming consumer for document-hits pairs.
*/
public void highlightDocuments(PrimitiveIterator.OfInt docIds, MatchOffsetsConsumer consumer)
throws IOException {
if (leaves.isEmpty()) {
return;
}
Iterator<LeafReaderContext> ctx = leaves.iterator();
LeafReaderContext currentContext = ctx.next();
int previousDocId = -1;
Map<String, List<OffsetRange>> highlights = new TreeMap<>();
while (docIds.hasNext()) {
int docId = docIds.nextInt();
if (docId < previousDocId) {
throw new RuntimeException("Input document IDs must be sorted (increasing).");
}
previousDocId = docId;
while (docId >= currentContext.docBase + currentContext.reader().maxDoc()) {
currentContext = ctx.next();
}
int contextRelativeDocId = docId - currentContext.docBase;
// Only preload fields we may potentially need.
FieldValueProvider documentSupplier;
if (preloadFields.isEmpty()) {
documentSupplier = null;
} else {
Document doc = currentContext.reader().document(contextRelativeDocId, preloadFields);
documentSupplier = new DocumentFieldValueProvider(doc);
}
highlights.clear();
highlightDocument(currentContext, contextRelativeDocId, documentSupplier, (field) -> true, highlights);
consumer.accept(docId, currentContext.reader(), contextRelativeDocId, highlights);
}
}
/**
* Low-level method for retrieving hit ranges for a single document. This method can be used with
* custom document {@link FieldValueProvider}.
*/
public void highlightDocument(
LeafReaderContext leafReaderContext,
int contextDocId,
FieldValueProvider doc,
Predicate<String> acceptField,
Map<String, List<OffsetRange>> outputHighlights)
throws IOException {
Matches matches = weight.matches(leafReaderContext, contextDocId);
if (matches == null) {
return;
}
for (String field : affectedFields) {
if (acceptField.test(field)) {
MatchesIterator matchesIterator = matches.getMatches(field);
if (matchesIterator == null) {
// No matches on this field, even though the field was part of the query. This can happen
// with complex queries that source non-text fields (which have no "hit regions" in any
// textual representation). Skip.
} else {
OffsetsRetrievalStrategy offsetStrategy = offsetStrategies.get(field);
if (offsetStrategy == null) {
throw new IOException(
"Non-empty matches but no offset retrieval strategy for field: " + field);
}
List<OffsetRange> ranges = offsetStrategy.get(matchesIterator, doc);
if (!ranges.isEmpty()) {
outputHighlights.put(field, ranges);
}
}
}
}
}
private boolean checkOrderConsistency(List<LeafReaderContext> leaves) {
for (int i = 1; i < leaves.size(); i++) {
LeafReaderContext prev = leaves.get(i - 1);
LeafReaderContext next = leaves.get(i);
assert prev.docBase <= next.docBase;
assert prev.docBase + prev.reader().maxDoc() == next.docBase;
}
return true;
}
/**
* Compute default strategies for retrieving offsets from {@link MatchesIterator}
* instances for a given set of fields.
*/
public static OffsetsRetrievalStrategySupplier computeOffsetRetrievalStrategies(
IndexReader reader, Analyzer analyzer) {
FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader);
return (field) -> {
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
return (mi, doc) -> {
throw new IOException("FieldInfo is null for field: " + field);
};
}
switch (fieldInfo.getIndexOptions()) {
case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
return new OffsetsFromMatchIterator(field);
case DOCS_AND_FREQS_AND_POSITIONS:
return new OffsetsFromPositions(field, analyzer);
case DOCS_AND_FREQS:
case DOCS:
// By default, retrieve offsets from individual tokens
// returned by the analyzer (possibly narrowed down to
// only those terms that the query hinted at when passed
// a QueryVisitor).
//
// Alternative strategies are also possible and may make sense
// depending on the use case (OffsetsFromValues, for example).
return new OffsetsFromTokens(field, analyzer);
default:
return
(matchesIterator, doc) -> {
throw new IOException(
"Field is indexed without positions and/or offsets: "
+ field
+ ", "
+ fieldInfo.getIndexOptions());
};
}
};
}
/**
* Implements {@link FieldValueProvider} wrapping a preloaded
* {@link Document}.
*/
private static final class DocumentFieldValueProvider implements FieldValueProvider {
private final Document doc;
public DocumentFieldValueProvider(Document doc) {
this.doc = doc;
}
@Override
public List<CharSequence> getValues(String field) {
return Arrays.asList(doc.getValues(field));
}
}
}
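A sketch of typical use (not part of this commit), assuming an already-open IndexSearcher and an analyzer consistent with the index; the "body" field name is made up:

// Rewrite the query against the searcher, per the constructor contract.
Query query = new TermQuery(new Term("body", "fox")).rewrite(searcher.getIndexReader());
MatchRegionRetriever retriever = new MatchRegionRetriever(searcher, query, analyzer);
TopDocs topDocs = searcher.search(query, 10);
retriever.highlightDocuments(topDocs, (docId, leafReader, leafDocId, hits) -> {
  // hits maps each field affected by the query to its match offset ranges.
  hits.forEach((field, ranges) -> System.out.println(field + " -> " + ranges));
});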

View File

@@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.util.Objects;
/**
* A non-empty range of offset positions.
*/
public class OffsetRange {
/** Start index, inclusive. */
public final int from;
/** End index, exclusive. */
public final int to;
/**
* @param from Start index, inclusive.
* @param to End index, exclusive.
*/
public OffsetRange(int from, int to) {
assert from <= to : "A non-empty offset range is required: " + from + "-" + to;
this.from = from;
this.to = to;
}
public int length() {
return to - from;
}
@Override
public String toString() {
return "[from=" + from + ", to=" + to + "]";
}
@Override
public boolean equals(Object other) {
if (other == this) return true;
if (other instanceof OffsetRange) {
OffsetRange that = (OffsetRange) other;
return from == that.from && to == that.to;
} else {
return false;
}
}
@Override
public int hashCode() {
return Objects.hash(from, to);
}
}

View File

@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.search.MatchesIterator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* This strategy retrieves offsets directly from {@link MatchesIterator}.
*/
public final class OffsetsFromMatchIterator implements OffsetsRetrievalStrategy {
private final String field;
OffsetsFromMatchIterator(String field) {
this.field = field;
}
@Override
public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
throws IOException {
ArrayList<OffsetRange> ranges = new ArrayList<>();
while (matchesIterator.next()) {
int from = matchesIterator.startOffset();
int to = matchesIterator.endOffset();
if (from < 0 || to < 0) {
throw new IOException("Matches API returned negative offsets for field: " + field);
}
ranges.add(new OffsetRange(from, to));
}
return ranges;
}
}

View File

@@ -0,0 +1,154 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.search.MatchesIterator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* This strategy applies to fields with stored positions but no offsets. We re-analyze
* the field's value to find the offsets of match positions.
* <p>
* Note that this may fail if the index data (positions stored in the index) is out of sync
* with the field values or the analyzer. This strategy assumes that never happens.
*/
public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
private final String field;
private final Analyzer analyzer;
OffsetsFromPositions(String field, Analyzer analyzer) {
this.field = field;
this.analyzer = analyzer;
}
@Override
public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
throws IOException {
ArrayList<OffsetRange> ranges = new ArrayList<>();
while (matchesIterator.next()) {
int from = matchesIterator.startPosition();
int to = matchesIterator.endPosition();
if (from < 0 || to < 0) {
throw new IOException("Matches API returned negative positions for field: " + field);
}
ranges.add(new OffsetRange(from, to));
}
// Convert from positions to offsets.
ranges = convertPositionsToOffsets(ranges, analyzer, field, doc.getValues(field));
return ranges;
}
@Override
public boolean requiresDocument() {
return true;
}
private static ArrayList<OffsetRange> convertPositionsToOffsets(
ArrayList<OffsetRange> ranges,
Analyzer analyzer,
String fieldName,
List<CharSequence> values)
throws IOException {
if (ranges.isEmpty()) {
return ranges;
}
class LeftRight {
int left = Integer.MAX_VALUE;
int right = Integer.MIN_VALUE;
@Override
public String toString() {
return "[" + "L: " + left + ", R: " + right + ']';
}
}
Map<Integer, LeftRight> requiredPositionSpans = new HashMap<>();
int minPosition = Integer.MAX_VALUE;
int maxPosition = Integer.MIN_VALUE;
for (OffsetRange range : ranges) {
requiredPositionSpans.computeIfAbsent(range.from, (key) -> new LeftRight());
requiredPositionSpans.computeIfAbsent(range.to, (key) -> new LeftRight());
minPosition = Math.min(minPosition, range.from);
maxPosition = Math.max(maxPosition, range.to);
}
int position = -1;
int valueOffset = 0;
for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) {
final String value = values.get(valueIndex).toString();
final boolean lastValue = valueIndex + 1 == max;
TokenStream ts = analyzer.tokenStream(fieldName, value);
OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
PositionIncrementAttribute posAttr = ts.getAttribute(PositionIncrementAttribute.class);
ts.reset();
while (ts.incrementToken()) {
position += posAttr.getPositionIncrement();
if (position >= minPosition) {
LeftRight leftRight = requiredPositionSpans.get(position);
if (leftRight != null) {
int startOffset = valueOffset + offsetAttr.startOffset();
int endOffset = valueOffset + offsetAttr.endOffset();
leftRight.left = Math.min(leftRight.left, startOffset);
leftRight.right = Math.max(leftRight.right, endOffset);
}
// Only short-circuit if we're on the last value (which should be the common
// case since most fields would only have a single value anyway). We need
// to make sure of this because otherwise offsetAttr would have an incorrect value.
if (position > maxPosition && lastValue) {
break;
}
}
}
ts.end();
position += posAttr.getPositionIncrement() + analyzer.getPositionIncrementGap(fieldName);
valueOffset += offsetAttr.endOffset() + analyzer.getOffsetGap(fieldName);
ts.close();
}
ArrayList<OffsetRange> converted = new ArrayList<>();
for (OffsetRange range : ranges) {
LeftRight left = requiredPositionSpans.get(range.from);
LeftRight right = requiredPositionSpans.get(range.to);
if (left == null
|| right == null
|| left.left == Integer.MAX_VALUE
|| right.right == Integer.MIN_VALUE) {
throw new RuntimeException("Position not properly initialized for range: " + range);
}
converted.add(new OffsetRange(left.left, right.right));
}
return converted;
}
}
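The position-to-offset bookkeeping above follows plain token-stream consumption; for reference (not part of this commit), a standalone sketch using org.apache.lucene.analysis.core.WhitespaceAnalyzer:

try (TokenStream ts = new WhitespaceAnalyzer().tokenStream("f", "quick brown fox")) {
  OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posAttr = ts.getAttribute(PositionIncrementAttribute.class);
  ts.reset();
  int position = -1;
  while (ts.incrementToken()) {
    position += posAttr.getPositionIncrement();
    // Prints: 0 -> [0, 5), 1 -> [6, 11), 2 -> [12, 15)
    System.out.println(position + " -> [" + offsetAttr.startOffset() + ", " + offsetAttr.endOffset() + ")");
  }
  ts.end();
}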

View File

@@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* This strategy works for fields where we know the match occurred but there are
* no known positions or offsets.
* <p>
* We re-analyze field values and return offset ranges of those tokens that
* are also present in the set of terms collected from the query.
*/
public final class OffsetsFromTokens implements OffsetsRetrievalStrategy {
private final String field;
private final Analyzer analyzer;
public OffsetsFromTokens(String field, Analyzer analyzer) {
this.field = field;
this.analyzer = analyzer;
}
@Override
public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) throws IOException {
List<CharSequence> values = doc.getValues(field);
Set<BytesRef> matchTerms = new HashSet<>();
while (matchesIterator.next()) {
Query q = matchesIterator.getQuery();
q.visit(new QueryVisitor() {
@Override
public void consumeTerms(Query query, Term... terms) {
for (Term t : terms) {
if (field.equals(t.field())) {
matchTerms.add(t.bytes());
}
}
}
});
}
ArrayList<OffsetRange> ranges = new ArrayList<>();
int valueOffset = 0;
for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) {
final String value = values.get(valueIndex).toString();
TokenStream ts = analyzer.tokenStream(field, value);
OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
TermToBytesRefAttribute termAttr = ts.getAttribute(TermToBytesRefAttribute.class);
ts.reset();
while (ts.incrementToken()) {
if (matchTerms.contains(termAttr.getBytesRef())) {
int startOffset = valueOffset + offsetAttr.startOffset();
int endOffset = valueOffset + offsetAttr.endOffset();
ranges.add(new OffsetRange(startOffset, endOffset));
}
}
ts.end();
valueOffset += offsetAttr.endOffset() + analyzer.getOffsetGap(field);
ts.close();
}
return ranges;
}
@Override
public boolean requiresDocument() {
return true;
}
}

View File

@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.search.MatchesIterator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* This strategy works for fields where we know the match occurred but there are
* no known positions or offsets.
* <p>
* We re-analyze field values and return offset ranges for entire values
* (not individual tokens). Re-analysis is required because the analyzer may
* introduce an unknown offset gap between values.
*/
public final class OffsetsFromValues implements OffsetsRetrievalStrategy {
private final String field;
private final Analyzer analyzer;
public OffsetsFromValues(String field, Analyzer analyzer) {
this.field = field;
this.analyzer = analyzer;
}
@Override
public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) throws IOException {
List<CharSequence> values = doc.getValues(field);
ArrayList<OffsetRange> ranges = new ArrayList<>();
int valueOffset = 0;
for (CharSequence charSequence : values) {
final String value = charSequence.toString();
TokenStream ts = analyzer.tokenStream(field, value);
OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
ts.reset();
int startOffset = valueOffset;
while (ts.incrementToken()) {
// Consume all tokens so that end() below sets the final offset attribute.
}
ts.end();
valueOffset += offsetAttr.endOffset();
ranges.add(new OffsetRange(startOffset, valueOffset));
valueOffset += analyzer.getOffsetGap(field);
ts.close();
}
return ranges;
}
@Override
public boolean requiresDocument() {
return true;
}
}

View File

@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.search.MatchesIterator;
import java.io.IOException;
import java.util.List;
/**
* Determines how match offset regions are computed from {@link MatchesIterator}. Several
* possibilities exist, ranging from retrieving offsets directly from a match instance
* to re-evaluating the document's field and recomputing offsets from there.
*/
public interface OffsetsRetrievalStrategy {
/**
* Return value offsets (match ranges) acquired from the given {@link MatchesIterator}.
*/
List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
throws IOException;
/**
* Whether this strategy requires document field access.
*/
default boolean requiresDocument() {
return false;
}
}

View File

@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.util.function.Function;
/**
* A per-field supplier of {@link OffsetsRetrievalStrategy}.
*/
@FunctionalInterface
public interface OffsetsRetrievalStrategySupplier extends Function<String, OffsetsRetrievalStrategy> {
}
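A sketch (not part of this commit) of composing a custom supplier with the defaults; the "title" field name is hypothetical:

OffsetsRetrievalStrategySupplier defaults =
    MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer);
OffsetsRetrievalStrategySupplier custom =
    (field) -> field.equals("title")
        ? new OffsetsFromValues(field, analyzer) // highlight entire values of titles
        : defaults.apply(field);
MatchRegionRetriever retriever = new MatchRegionRetriever(searcher, query, analyzer, custom);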

View File

@@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.util.List;
/**
* A passage is a fragment of source text, scored and possibly with a list of sub-offsets (markers)
* to be highlighted. The markers can be overlapping or nested, but they're always contained within
* the passage.
*/
public class Passage extends OffsetRange {
public List<OffsetRange> markers;
public Passage(int from, int to, List<OffsetRange> markers) {
super(from, to);
this.markers = markers;
}
@Override
public String toString() {
return "[" + super.toString() + ", markers=" + markers + "]";
}
}

View File

@@ -0,0 +1,27 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
/**
* Adjusts the range of one or more passages over a given value. An adjuster
* could, for example, shift a passage boundary to the next or previous word
* delimiter or whitespace.
*/
public interface PassageAdjuster {
void currentValue(CharSequence value);
OffsetRange adjust(Passage p);
}

View File

@@ -0,0 +1,214 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.RandomAccess;
import java.util.function.Function;
/**
* Formats a collection of {@linkplain Passage passages} over a given string, resolving
* overlapping markers and respecting the permitted sub-ranges of the input string,
* as well as length constraints.
* <p>
* Passages are demarcated with the constructor-provided ellipsis and start/end marker
* sequences.
*/
public class PassageFormatter {
private final String ellipsis;
private final Function<OffsetRange, String> markerStart;
private final Function<OffsetRange, String> markerEnd;
private final ArrayList<OffsetRange> markerStack = new ArrayList<>();
public PassageFormatter(String ellipsis, String markerStart, String markerEnd) {
this(ellipsis, (m) -> markerStart, (m) -> markerEnd);
}
public PassageFormatter(
String ellipsis,
Function<OffsetRange, String> markerStart,
Function<OffsetRange, String> markerEnd) {
this.ellipsis = ellipsis;
this.markerStart = markerStart;
this.markerEnd = markerEnd;
}
public List<String> format(CharSequence value, List<Passage> passages, List<OffsetRange> ranges) {
assert PassageSelector.sortedAndNonOverlapping(passages);
assert PassageSelector.sortedAndNonOverlapping(ranges);
assert ranges instanceof RandomAccess;
if (ranges.isEmpty()) {
return Collections.emptyList();
}
ArrayList<String> result = new ArrayList<>();
StringBuilder buf = new StringBuilder();
int rangeIndex = 0;
OffsetRange range = ranges.get(rangeIndex);
passageFormatting:
for (Passage passage : passages) {
// Move to the range of the current passage.
while (passage.from >= range.to) {
if (++rangeIndex == ranges.size()) {
break passageFormatting;
}
range = ranges.get(rangeIndex);
}
assert range.from <= passage.from && range.to >= passage.to : range + " ? " + passage;
buf.setLength(0);
if (range.from < passage.from) {
buf.append(ellipsis);
}
format(buf, value, passage);
if (range.to > passage.to) {
buf.append(ellipsis);
}
result.add(buf.toString());
}
return result;
}
public StringBuilder format(StringBuilder buf, CharSequence value, final Passage passage) {
switch (passage.markers.size()) {
case 0:
// No markers, full passage appended.
buf.append(value, passage.from, passage.to);
break;
case 1:
// One marker, trivial and frequent case so it's handled separately.
OffsetRange m = passage.markers.iterator().next();
buf.append(value, passage.from, m.from);
buf.append(markerStart.apply(m));
buf.append(value, m.from, m.to);
buf.append(markerEnd.apply(m));
buf.append(value, m.to, passage.to);
break;
default:
// Multiple markers, possibly overlapping or nested.
markerStack.clear();
multipleMarkers(value, passage, buf, markerStack);
break;
}
return buf;
}
/** Handle multiple markers, possibly overlapping or nested. */
private void multipleMarkers(
CharSequence value, final Passage p, StringBuilder b, ArrayList<OffsetRange> markerStack) {
int at = p.from;
int max = p.to;
SlicePoint[] slicePoints = slicePoints(p);
for (SlicePoint slicePoint : slicePoints) {
b.append(value, at, slicePoint.offset);
OffsetRange currentMarker = slicePoint.marker;
switch (slicePoint.type) {
case START:
markerStack.add(currentMarker);
b.append(markerStart.apply(currentMarker));
break;
case END:
int markerIndex = markerStack.lastIndexOf(currentMarker);
for (int k = markerIndex; k < markerStack.size(); k++) {
b.append(markerEnd.apply(markerStack.get(k)));
}
markerStack.remove(markerIndex);
for (int k = markerIndex; k < markerStack.size(); k++) {
b.append(markerStart.apply(markerStack.get(k)));
}
break;
default:
throw new RuntimeException();
}
at = slicePoint.offset;
}
if (at < max) {
b.append(value, at, max);
}
}
private static SlicePoint[] slicePoints(Passage p) {
SlicePoint[] slicePoints = new SlicePoint[p.markers.size() * 2];
int x = 0;
for (OffsetRange m : p.markers) {
slicePoints[x++] = new SlicePoint(SlicePoint.Type.START, m.from, m);
slicePoints[x++] = new SlicePoint(SlicePoint.Type.END, m.to, m);
}
// Order slice points by their offset
Comparator<SlicePoint> c =
Comparator.<SlicePoint>comparingInt(pt -> pt.offset)
.thenComparingInt(pt -> pt.type.ordering)
.thenComparing(
(a, b) -> {
if (a.type == SlicePoint.Type.START) {
// Longer start slice points come first.
return Integer.compare(b.marker.to, a.marker.to);
} else {
// Shorter end slice points come first.
return Integer.compare(b.marker.from, a.marker.from);
}
});
Arrays.sort(slicePoints, c);
return slicePoints;
}
static class SlicePoint {
enum Type {
START(2),
END(1);
private final int ordering;
Type(int ordering) {
this.ordering = ordering;
}
}
public final int offset;
public final Type type;
public final OffsetRange marker;
public SlicePoint(Type t, int offset, OffsetRange m) {
this.type = t;
this.offset = offset;
this.marker = m;
}
@Override
public String toString() {
return "(" + type + ", " + marker + ")";
}
}
}
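A worked example (not part of this commit) of formatting a single passage with two overlapping markers; the value and offsets are hand-picked:

String value = "quick brown fox jumped over";
List<OffsetRange> markers = List.of(new OffsetRange(6, 11), new OffsetRange(6, 15));
PassageFormatter formatter = new PassageFormatter("...", ">", "<");
List<String> out = formatter.format(value, List.of(new Passage(0, 15, markers)),
    List.of(new OffsetRange(0, value.length())));
// out: ["quick >>brown< fox<..."] -- the longer marker opens first, the inner one
// closes first, and the trailing ellipsis marks the truncated remainder of the value.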

View File

@@ -0,0 +1,273 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.PriorityQueue;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.RandomAccess;
/** Selects fragments of text that score best for the given set of highlight markers. */
public class PassageSelector {
public static final Comparator<Passage> DEFAULT_SCORER =
(a, b) -> {
// Compare the number of highlights first.
int v;
v = Integer.compare(a.markers.size(), b.markers.size());
if (v != 0) {
return v;
}
// Total number of characters covered by the highlights.
int len1 = 0, len2 = 0;
for (OffsetRange o : a.markers) {
len1 += o.length();
}
for (OffsetRange o : b.markers) {
len2 += o.length();
}
if (len1 != len2) {
return Integer.compare(len1, len2);
}
return Integer.compare(b.from, a.from);
};
private final Comparator<Passage> passageScorer;
private final PassageAdjuster passageAdjuster;
public PassageSelector() {
this(DEFAULT_SCORER, null);
}
public PassageSelector(Comparator<Passage> passageScorer, PassageAdjuster passageAdjuster) {
this.passageScorer = passageScorer;
this.passageAdjuster = passageAdjuster;
}
public List<Passage> pickBest(
CharSequence value,
List<? extends OffsetRange> markers,
int maxPassageWindow,
int maxPassages) {
return pickBest(
value, markers, maxPassageWindow, maxPassages, List.of(new OffsetRange(0, value.length())));
}
public List<Passage> pickBest(
CharSequence value,
List<? extends OffsetRange> markers,
int maxPassageWindow,
int maxPassages,
List<OffsetRange> permittedPassageRanges) {
assert markers instanceof RandomAccess && permittedPassageRanges instanceof RandomAccess;
// Handle odd special cases early.
if (value.length() == 0 || maxPassageWindow == 0) {
return Collections.emptyList();
}
// Sort markers by their start offset, shortest first.
markers.sort(
(a, b) -> {
int v = Integer.compare(a.from, b.from);
return v != 0 ? v : Integer.compare(a.to, b.to);
});
// Determine a maximum offset window around each highlight marker and
// pick the best scoring passage candidates.
PriorityQueue<Passage> pq =
new PriorityQueue<>(maxPassages) {
@Override
protected boolean lessThan(Passage a, Passage b) {
return passageScorer.compare(a, b) < 0;
}
};
assert sortedAndNonOverlapping(permittedPassageRanges);
final int max = markers.size();
int markerIndex = 0;
nextRange:
for (OffsetRange range : permittedPassageRanges) {
final int rangeTo = Math.min(range.to, value.length());
// Skip ranges that fall outside of the value window.
if (range.from >= rangeTo) {
continue;
}
while (markerIndex < max) {
OffsetRange m = markers.get(markerIndex);
// Markers are sorted, so if the current marker starts at or past the end of this
// range, advance to the next range (and re-check the same marker against it).
if (m.from >= rangeTo) {
continue nextRange;
}
// Check if current marker falls within the range and is smaller than the largest allowed
// passage window.
if (m.from >= range.from && m.to <= rangeTo && m.length() <= maxPassageWindow) {
// Adjust the window range to center the highlight marker.
int from = (m.from + m.to - maxPassageWindow) / 2;
int to = (m.from + m.to + maxPassageWindow) / 2;
if (from < range.from) {
to += range.from - from;
from = range.from;
}
if (to > rangeTo) {
from -= to - rangeTo;
to = rangeTo;
if (from < range.from) {
from = range.from;
}
}
if (from < to && to <= value.length()) {
// Find other markers that are completely inside the passage window.
ArrayList<OffsetRange> inside = new ArrayList<>();
int i = markerIndex;
while (i > 0 && markers.get(i - 1).from >= from) {
i--;
}
OffsetRange c;
for (; i < max && (c = markers.get(i)).from < to; i++) {
if (c.to <= to) {
inside.add(c);
}
}
if (!inside.isEmpty()) {
pq.insertWithOverflow(new Passage(from, to, inside));
}
}
}
// Advance to the next marker.
markerIndex++;
}
}
// Collect from the priority queue (reverse the order so that highest-scoring are first).
Passage[] passages;
if (pq.size() > 0) {
passages = new Passage[pq.size()];
for (int i = pq.size(); --i >= 0; ) {
passages[i] = pq.pop();
}
} else {
// Handle the default, no highlighting markers case.
passages = pickDefaultPassage(value, maxPassageWindow, permittedPassageRanges);
}
// Correct passage boundaries derived from the maximum-width window. Typically this shrinks
// the boundaries until they fall on a proper word/sentence boundary.
if (passageAdjuster != null) {
passageAdjuster.currentValue(value);
for (int x = 0; x < passages.length; x++) {
Passage p = passages[x];
OffsetRange newRange = passageAdjuster.adjust(p);
if (newRange.from != p.from || newRange.to != p.to) {
assert newRange.from >= p.from && newRange.to <= p.to
: "Adjusters must not expand the passage's range: was "
+ p
+ " => changed to "
+ newRange;
passages[x] = new Passage(newRange.from, newRange.to, p.markers);
}
}
}
// Ensure there are no overlaps on passages. In case of conflicts, better score wins.
int last = 0;
for (int i = 0; i < passages.length; i++) {
Passage a = passages[i];
if (a != null && a.length() > 0) {
passages[last++] = a;
for (int j = i + 1; j < passages.length; j++) {
Passage b = passages[j];
if (b != null) {
if (adjacentOrOverlapping(a, b)) {
passages[j] = null;
}
}
}
}
}
// Remove nullified slots.
if (passages.length != last) {
passages = ArrayUtil.copyOfSubArray(passages, 0, last);
}
// Sort in the offset order again.
Arrays.sort(passages, (a, b) -> Integer.compare(a.from, b.from));
return Arrays.asList(passages);
}
static boolean sortedAndNonOverlapping(List<? extends OffsetRange> permittedPassageRanges) {
if (permittedPassageRanges.size() > 1) {
Iterator<? extends OffsetRange> i = permittedPassageRanges.iterator();
for (OffsetRange next, previous = i.next(); i.hasNext(); previous = next) {
next = i.next();
if (previous.to > next.from) {
throw new AssertionError(
"Ranges must be sorted and non-overlapping: " + permittedPassageRanges);
}
}
}
return true;
}
/**
* Invoked when no passages could be selected (due to constraints or lack of highlight markers).
*/
protected Passage[] pickDefaultPassage(
CharSequence value, int maxCharacterWindow, List<OffsetRange> permittedPassageRanges) {
// Search for the first range that is not empty.
for (OffsetRange o : permittedPassageRanges) {
int to = Math.min(value.length(), o.to);
if (o.from < to) {
return new Passage[] {
new Passage(
o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList())
};
}
}
return new Passage[] {};
}
private static boolean adjacentOrOverlapping(Passage a, Passage b) {
if (a.from >= b.from) {
return a.from <= b.to - 1;
} else {
return a.to - 1 >= b.from;
}
}
}
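A small sketch (not part of this commit) of selecting passages around highlight markers; the value and offsets are made up:

String value = "0123 quick brown fox jumped";
// pickBest sorts the marker list in place, so pass a mutable list.
List<OffsetRange> markers = new ArrayList<>(List.of(new OffsetRange(11, 16))); // "brown"
List<Passage> best = new PassageSelector().pickBest(value, markers, 20, 2);
// A single passage of at most 20 characters, centered on the "brown" marker.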

View File

@@ -0,0 +1,28 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* This package contains several components useful to build a highlighter
* on top of the {@link org.apache.lucene.search.Matches} API.
*
* {@link org.apache.lucene.search.matchhighlight.MatchRegionRetriever} can be
* used to retrieve hit areas for a given {@link org.apache.lucene.search.Query}
* and one (or more) indexed documents. These hit areas can then be passed to
* {@link org.apache.lucene.search.matchhighlight.PassageSelector} and formatted
* with {@link org.apache.lucene.search.matchhighlight.PassageFormatter}.
*/
package org.apache.lucene.search.matchhighlight;
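Continuing the retriever sketch above (not part of this commit), the three components compose end to end as follows; field values are assumed to be stored, and multi-value offset gaps are ignored for brevity:

retriever.highlightDocuments(topDocs, (docId, leafReader, leafDocId, hits) -> {
  Document doc = leafReader.document(leafDocId);
  PassageSelector selector = new PassageSelector();
  PassageFormatter formatter = new PassageFormatter("...", ">", "<");
  hits.forEach((field, ranges) -> {
    String value = String.join(" ", doc.getValues(field));
    List<Passage> passages = selector.pickBest(value, new ArrayList<>(ranges), 160, 3);
    System.out.println(field + ": "
        + formatter.format(value, passages, List.of(new OffsetRange(0, value.length()))));
  });
});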

View File

@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
* A simple ASCII match range highlighter for tests.
*/
final class AsciiMatchRangeHighlighter {
private final Analyzer analyzer;
private final PassageFormatter passageFormatter;
private final PassageSelector selector;
private int maxPassageWindow = 160;
private int maxPassages = 10;
public AsciiMatchRangeHighlighter(Analyzer analyzer) {
this.passageFormatter = new PassageFormatter("...", ">", "<");
this.selector = new PassageSelector();
this.analyzer = analyzer;
}
public Map<String, List<String>> apply(Document document, Map<String, List<OffsetRange>> fieldHighlights) {
ArrayList<OffsetRange> valueRanges = new ArrayList<>();
Map<String, List<String>> fieldSnippets = new LinkedHashMap<>();
fieldHighlights.forEach(
(field, matchRanges) -> {
int offsetGap = analyzer.getOffsetGap(field);
String[] values = document.getValues(field);
String value;
if (values.length == 1) {
value = values[0];
} else {
// This can be inefficient if the offset gap is large, but recomputing
// offsets in a smarter way doesn't make sense for tests.
String fieldGapPadding = " ".repeat(offsetGap);
value = String.join(fieldGapPadding, values);
}
// Create permitted range windows for passages so that they don't cross
// multi-value boundaries.
valueRanges.clear();
int offset = 0;
for (CharSequence v : values) {
valueRanges.add(new OffsetRange(offset, offset + v.length()));
offset += v.length();
offset += offsetGap;
}
List<Passage> passages =
selector.pickBest(value, matchRanges, maxPassageWindow, maxPassages, valueRanges);
fieldSnippets.put(field, passageFormatter.format(value, passages, valueRanges));
});
return fieldSnippets;
}
}

View File

@@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import java.io.Reader;
/** An {@link Analyzer} that throws a runtime exception when used for anything. */
final class MissingAnalyzer extends Analyzer {
@Override
protected Reader initReader(String fieldName, Reader reader) {
throw new RuntimeException("Field must have an explicit Analyzer: " + fieldName);
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
throw new RuntimeException("Field must have an explicit Analyzer: " + fieldName);
}
@Override
public int getOffsetGap(String fieldName) {
return 0;
}
}

View File

@@ -0,0 +1,767 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.function.BiFunction;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.emptyArray;
import static org.hamcrest.Matchers.not;
public class TestMatchRegionRetriever extends LuceneTestCase {
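  // Expected highlights in these tests are encoded as "docId: (field: 'snippet')", with
  // multiple snippets of one field joined by " | " and each matching region wrapped in
  // '>' and '<' markers (see the highlights(...) helpers at the bottom of this class).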
private static final String FLD_ID = "field_id";
private static final String FLD_TEXT_POS_OFFS1 = "field_text_offs1";
private static final String FLD_TEXT_POS_OFFS2 = "field_text_offs2";
private static final String FLD_TEXT_POS_OFFS = "field_text_offs";
private static final String FLD_TEXT_POS = "field_text";
private static final String FLD_TEXT_SYNONYMS_POS_OFFS = "field_text_syns_offs";
private static final String FLD_TEXT_SYNONYMS_POS = "field_text_syns";
private static final String FLD_TEXT_NOPOS = "field_text_nopos";
private static final String FLD_NON_EXISTING = "field_missing";
private FieldType TYPE_STORED_WITH_OFFSETS;
private FieldType TYPE_STORED_NO_POSITIONS;
private Analyzer analyzer;
@Before
public void setup() {
TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED);
TYPE_STORED_WITH_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
TYPE_STORED_WITH_OFFSETS.freeze();
TYPE_STORED_NO_POSITIONS = new FieldType(TextField.TYPE_STORED);
TYPE_STORED_NO_POSITIONS.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
TYPE_STORED_NO_POSITIONS.freeze();
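    // Randomize the gaps inserted between multiple values of a field so the tests
    // exercise position and offset arithmetic across value boundaries.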
Analyzer whitespaceAnalyzer =
new Analyzer() {
final int offsetGap = RandomizedTest.randomIntBetween(0, 2);
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
@Override
protected TokenStreamComponents createComponents(String fieldName) {
WhitespaceTokenizer tokenizer =
new WhitespaceTokenizer(CharTokenizer.DEFAULT_MAX_WORD_LEN);
return new TokenStreamComponents(tokenizer);
}
@Override
public int getOffsetGap(String fieldName) {
return offsetGap;
}
@Override
public int getPositionIncrementGap(String fieldName) {
return positionGap;
}
};
Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
fieldAnalyzers.put(FLD_TEXT_POS, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_POS_OFFS, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_POS_OFFS1, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_POS_OFFS2, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_NOPOS, whitespaceAnalyzer);
try {
SynonymMap.Builder b = new SynonymMap.Builder();
b.add(new CharsRef("foo\u0000bar"), new CharsRef("syn1"), true);
b.add(new CharsRef("baz"), new CharsRef("syn2\u0000syn3"), true);
SynonymMap synonymMap = b.build();
Analyzer synonymsAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WhitespaceTokenizer();
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
return new TokenStreamComponents(tokenizer, tokenStream);
}
};
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer);
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
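    // Any field not registered above falls through to MissingAnalyzer and fails fast.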
analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
}
BiFunction<String, String, Query> stdQueryParser =
(query, defField) -> {
try {
StandardQueryParser parser = new StandardQueryParser(analyzer);
parser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);
return parser.parse(query, defField);
} catch (QueryNodeException e) {
throw new RuntimeException(e);
}
};
@Test
public void testTermQueryWithOffsets() throws IOException {
checkTermQuery(FLD_TEXT_POS_OFFS);
}
@Test
public void testTermQueryWithPositions() throws IOException {
checkTermQuery(FLD_TEXT_POS);
}
private void checkTermQuery(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
assertThat(highlights(reader, new TermQuery(new Term(field, "foo"))),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar baz')", field),
fmt("1: (%s: 'bar >foo< baz')", field),
fmt("2: (%s: 'bar baz >foo<')", field)));
});
}
@Test
public void testBooleanMultifieldQueryWithOffsets() throws IOException {
checkBooleanMultifieldQuery(FLD_TEXT_POS_OFFS);
}
@Test
public void testBooleanMultifieldQueryWithPositions() throws IOException {
checkBooleanMultifieldQuery(FLD_TEXT_POS);
}
private void checkBooleanMultifieldQuery(String field) throws IOException {
Query query =
new BooleanQuery.Builder()
.add(new PhraseQuery(1, field, "foo", "baz"), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(FLD_NON_EXISTING, "abc")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(field, "xyz")), BooleanClause.Occur.MUST_NOT)
.build();
withReader(
List.of(
Map.of(field, values("foo bar baz abc")),
Map.of(field, values("bar foo baz def")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
assertThat(highlights(reader, query),
containsInAnyOrder(
fmt("0: (%s: '>foo bar baz< abc')", field),
fmt("1: (%s: 'bar >foo baz< def')", field)));
});
}
@Test
public void testVariousQueryTypesWithOffsets() throws IOException {
checkVariousQueryTypes(FLD_TEXT_POS_OFFS);
}
@Test
public void testVariousQueryTypesWithPositions() throws IOException {
checkVariousQueryTypes(FLD_TEXT_POS);
}
private void checkVariousQueryTypes(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz abc")),
Map.of(field, values("bar foo baz def")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
assertThat(highlights(reader, stdQueryParser.apply("foo baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar >baz< abc')", field),
fmt("1: (%s: 'bar >foo< >baz< def')", field),
fmt("2: (%s: 'bar >baz< >foo< xyz')", field)));
assertThat(highlights(reader, stdQueryParser.apply("foo OR xyz", field)),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar baz abc')", field),
fmt("1: (%s: 'bar >foo< baz def')", field),
fmt("2: (%s: 'bar baz >foo< >xyz<')", field)));
assertThat(highlights(reader, stdQueryParser.apply("bas~2", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >bar< >baz< >abc<')", field),
fmt("1: (%s: '>bar< foo >baz< def')", field),
fmt("2: (%s: '>bar< >baz< foo xyz')", field)));
assertThat(highlights(reader, stdQueryParser.apply("\"foo bar\"", field)),
              containsInAnyOrder(fmt("0: (%s: '>foo bar< baz abc')", field)));
assertThat(highlights(reader, stdQueryParser.apply("\"foo bar\"~3", field)),
containsInAnyOrder(
fmt("0: (%s: '>foo bar< baz abc')", field),
fmt("1: (%s: '>bar foo< baz def')", field),
fmt("2: (%s: '>bar baz foo< xyz')", field)));
assertThat(highlights(reader, stdQueryParser.apply("ba*", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >bar< >baz< abc')", field),
fmt("1: (%s: '>bar< foo >baz< def')", field),
fmt("2: (%s: '>bar< >baz< foo xyz')", field)));
assertThat(highlights(reader, stdQueryParser.apply("[bar TO bas]", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >bar< baz abc')", field),
fmt("1: (%s: '>bar< foo baz def')", field),
fmt("2: (%s: '>bar< baz foo xyz')", field)));
          // Note that document '2' has a 'bar' that isn't highlighted (the -xyz term
          // excludes that document from the first clause).
assertThat(
highlights(reader, stdQueryParser.apply("([bar TO baz] -xyz) OR baz", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >bar< >>baz<< abc')", field),
fmt("1: (%s: '>bar< foo >>baz<< def')", field),
fmt("2: (%s: 'bar >baz< foo xyz')", field)));
assertThat(highlights(reader, new MatchAllDocsQuery()),
Matchers.hasSize(0));
});
withReader(
List.of(
Map.of(field, values("foo baz foo")),
Map.of(field, values("bas baz foo")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
assertThat(
highlights(reader, stdQueryParser.apply("[bar TO baz] -bar", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >baz< foo')", field), fmt("1: (%s: '>bas< >baz< foo')", field)));
});
}
@Test
public void testIntervalQueries() throws IOException {
String field = FLD_TEXT_POS_OFFS;
withReader(
List.of(
Map.of(field, values("foo baz foo")),
Map.of(field, values("bas baz foo")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bas"),
Intervals.term("baz")))),
containsInAnyOrder(
fmt("1: (field_text_offs: '>bas baz foo<')", field)
));
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.maxgaps(1,
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bar"))))),
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.containing(
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bar")),
Intervals.term("foo")))),
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.containedBy(
Intervals.term("foo"),
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bar"))))),
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.overlapping(
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bar")),
Intervals.term("foo")))),
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
});
}
@Test
public void testMultivaluedFieldsWithOffsets() throws IOException {
checkMultivaluedFields(FLD_TEXT_POS_OFFS);
}
@Test
public void testMultivaluedFieldsWithPositions() throws IOException {
checkMultivaluedFields(FLD_TEXT_POS);
}
  private void checkMultivaluedFields(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar", "baz abc", "bad baz")),
Map.of(field, values("bar foo", "baz def")),
Map.of(field, values("bar baz", "foo xyz"))),
reader -> {
assertThat(highlights(reader, stdQueryParser.apply("baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>baz< abc | bad >baz<')", field),
fmt("1: (%s: '>baz< def')", field),
fmt("2: (%s: 'bar >baz<')", field)));
});
}
@Test
public void testMultiFieldHighlights() throws IOException {
for (String[] fields :
new String[][]{
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS}
}) {
String field1 = fields[0];
String field2 = fields[1];
withReader(
List.of(
Map.of(
field1, values("foo bar", "baz abc"),
field2, values("foo baz", "loo bar"))),
reader -> {
String ordered =
Stream.of(fmt("(%s: '>baz< abc')", field1), fmt("(%s: 'loo >bar<')", field2))
.sorted()
.collect(Collectors.joining(""));
assertThat(
highlights(
reader,
stdQueryParser.apply(field1 + ":baz" + " OR " + field2 + ":bar", field1)),
containsInAnyOrder(fmt("0: %s", ordered)));
});
}
}
/**
   * Rewritten Boolean queries may omit matches from {@link
   * org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses. Check that the
   * highlighter still reports such matches.
*/
@Test
public void testNoRewrite() throws IOException {
String field1 = FLD_TEXT_POS_OFFS1;
String field2 = FLD_TEXT_POS_OFFS2;
withReader(
List.of(
Map.of(
field1, values("0100"),
field2, values("loo bar")),
Map.of(
field1, values("0200"),
field2, values("foo bar"))),
reader -> {
String expected = fmt("0: (%s: '>0100<')(%s: 'loo >bar<')", field1, field2);
assertThat(
highlights(
reader,
stdQueryParser.apply(fmt("+%s:01* OR %s:bar", field1, field2), field1)),
containsInAnyOrder(expected));
assertThat(
highlights(
reader,
stdQueryParser.apply(fmt("+%s:01* AND %s:bar", field1, field2), field1)),
containsInAnyOrder(expected));
});
}
@Test
public void testNestedQueryHitsWithOffsets() throws IOException {
checkNestedQueryHits(FLD_TEXT_POS_OFFS);
}
@Test
public void testNestedQueryHitsWithPositions() throws IOException {
checkNestedQueryHits(FLD_TEXT_POS);
}
  private void checkNestedQueryHits(String field) throws IOException {
withReader(
List.of(Map.of(field, values("foo bar baz abc"))),
reader -> {
assertThat(
highlights(
reader,
new BooleanQuery.Builder()
.add(new PhraseQuery(1, field, "foo", "baz"), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(field, "bar")), BooleanClause.Occur.SHOULD)
.build()),
containsInAnyOrder(fmt("0: (%s: '>foo >bar< baz< abc')", field)));
assertThat(
highlights(
reader,
new BooleanQuery.Builder()
.add(new PhraseQuery(1, field, "foo", "baz"), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(field, "bar")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(field, "baz")), BooleanClause.Occur.SHOULD)
.build()),
containsInAnyOrder(fmt("0: (%s: '>foo >bar< >baz<< abc')", field)));
});
}
@Test
public void testGraphQueryWithOffsets() throws Exception {
checkGraphQuery(FLD_TEXT_SYNONYMS_POS_OFFS);
}
@Test
public void testGraphQueryWithPositions() throws Exception {
checkGraphQuery(FLD_TEXT_SYNONYMS_POS);
}
private void checkGraphQuery(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
assertThat(highlights(reader, new TermQuery(new Term(field, "syn1"))),
containsInAnyOrder(fmt("0: (%s: '>foo bar< baz')", field)));
          // 'baz' is indexed as the synonym sequence [syn2 syn3],
          // so both of the queries below highlight 'baz'.
assertThat(highlights(reader, new TermQuery(new Term(field, "syn3"))),
containsInAnyOrder(
fmt("0: (%s: 'foo bar >baz<')", field),
fmt("1: (%s: 'bar foo >baz<')", field),
fmt("2: (%s: 'bar >baz< foo')", field)));
assertThat(
highlights(reader, stdQueryParser.apply(field + ":\"syn2 syn3\"", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo bar >baz<')", field),
fmt("1: (%s: 'bar foo >baz<')", field),
fmt("2: (%s: 'bar >baz< foo')", field)));
assertThat(
highlights(reader, stdQueryParser.apply(field + ":\"foo syn2 syn3\"", field)),
containsInAnyOrder(fmt("1: (%s: 'bar >foo baz<')", field)));
});
}
@Test
public void testSpanQueryWithOffsets() throws Exception {
checkSpanQueries(FLD_TEXT_POS_OFFS);
}
@Test
public void testSpanQueryWithPositions() throws Exception {
checkSpanQueries(FLD_TEXT_POS);
}
private void checkSpanQueries(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
assertThat(
highlights(
reader,
SpanNearQuery.newOrderedNearQuery(field)
.addClause(new SpanTermQuery(new Term(field, "bar")))
.addClause(new SpanTermQuery(new Term(field, "foo")))
.build()),
containsInAnyOrder(fmt("1: (%s: '>bar foo< baz')", field)));
assertThat(
highlights(
reader,
SpanNearQuery.newOrderedNearQuery(field)
.addClause(new SpanTermQuery(new Term(field, "bar")))
.addGap(1)
.addClause(new SpanTermQuery(new Term(field, "foo")))
.build()),
containsInAnyOrder(fmt("2: (%s: '>bar baz foo<')", field)));
assertThat(
highlights(
reader,
SpanNearQuery.newUnorderedNearQuery(field)
.addClause(new SpanTermQuery(new Term(field, "foo")))
.addClause(new SpanTermQuery(new Term(field, "bar")))
.build()),
containsInAnyOrder(
fmt("0: (%s: '>foo bar< baz')", field), fmt("1: (%s: '>bar foo< baz')", field)));
assertThat(
highlights(
reader,
SpanNearQuery.newUnorderedNearQuery(field)
.addClause(new SpanTermQuery(new Term(field, "foo")))
.addClause(new SpanTermQuery(new Term(field, "bar")))
.setSlop(1)
.build()),
containsInAnyOrder(
fmt("0: (%s: '>foo bar< baz')", field),
fmt("1: (%s: '>bar foo< baz')", field),
fmt("2: (%s: '>bar baz foo<')", field)));
});
}
/**
   * Runs a term query against a field indexed without positions or offsets and
   * checks the {@link OffsetsFromValues} strategy, which returns highlights
   * over entire indexed values.
*/
@Test
public void testTextFieldNoPositionsOffsetFromValues() throws Exception {
String field = FLD_TEXT_NOPOS;
withReader(
List.of(
Map.of(FLD_TEXT_NOPOS, values("foo bar")),
Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz baz"))
),
reader -> {
OffsetsRetrievalStrategySupplier defaults = MatchRegionRetriever
.computeOffsetRetrievalStrategies(reader, analyzer);
OffsetsRetrievalStrategySupplier customSuppliers = (fld) -> {
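            // Use OffsetsFromValues for the field under test; defer to the defaults elsewhere.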
if (fld.equals(field)) {
return new OffsetsFromValues(field, analyzer);
} else {
              return defaults.apply(fld);
}
};
assertThat(
highlights(
customSuppliers,
reader,
new TermQuery(new Term(field, "bar"))),
containsInAnyOrder(
fmt("0: (%s: '>foo bar<')", field),
fmt("1: (%s: '>foo bar< | >baz baz<')", field)));
});
}
/**
   * Runs a term query against a field indexed without positions or offsets; match
   * offsets are derived from tokens of the re-analyzed field values, so individual
   * matching terms are still highlighted.
   * <p>
   * Such a field structure is often useful for multivalued "keyword-like" fields.
*/
@Test
public void testTextFieldNoPositionsOffsetsFromTokens() throws Exception {
String field = FLD_TEXT_NOPOS;
withReader(
List.of(
Map.of(FLD_TEXT_NOPOS, values("foo bar"),
FLD_TEXT_POS, values("bar bar")),
Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz bar"))
),
reader -> {
assertThat(
highlights(
reader,
new TermQuery(new Term(field, "bar"))),
containsInAnyOrder(
fmt("0: (%s: 'foo >bar<')", field),
fmt("1: (%s: 'foo >bar< | baz >bar<')", field)));
});
}
private List<String> highlights(IndexReader reader, Query query) throws IOException {
return highlights(MatchRegionRetriever.computeOffsetRetrievalStrategies(reader, analyzer),
reader, query);
}
private List<String> highlights(OffsetsRetrievalStrategySupplier offsetsStrategySupplier,
IndexReader reader, Query query) throws IOException {
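    // Search with the rewritten query, then let MatchRegionRetriever feed per-document
    // match offsets to a consumer that renders them as "docId: (field: 'snippet')" strings.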
IndexSearcher searcher = new IndexSearcher(reader);
int maxDocs = 1000;
Query rewrittenQuery = searcher.rewrite(query);
TopDocs topDocs = searcher.search(rewrittenQuery, maxDocs);
ArrayList<String> highlights = new ArrayList<>();
AsciiMatchRangeHighlighter formatter = new AsciiMatchRangeHighlighter(analyzer);
MatchRegionRetriever.MatchOffsetsConsumer highlightCollector =
(docId, leafReader, leafDocId, fieldHighlights) -> {
StringBuilder sb = new StringBuilder();
Document document = leafReader.document(leafDocId);
formatter
.apply(document, new TreeMap<>(fieldHighlights))
.forEach(
(field, snippets) -> {
sb.append(
String.format(
Locale.ROOT, "(%s: '%s')", field, String.join(" | ", snippets)));
});
if (sb.length() > 0) {
sb.insert(0, document.get(FLD_ID) + ": ");
highlights.add(sb.toString());
}
};
MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, analyzer,
offsetsStrategySupplier);
highlighter.highlightDocuments(topDocs, highlightCollector);
return highlights;
}
private String[] values(String... values) {
assertThat(values, not(emptyArray()));
return values;
}
private void withReader(
Collection<Map<String, String[]>> docs, IOUtils.IOConsumer<DirectoryReader> block)
throws IOException {
IndexWriterConfig config = new IndexWriterConfig(analyzer);
try (Directory directory = new ByteBuffersDirectory()) {
IndexWriter iw = new IndexWriter(directory, config);
int seq = 0;
for (Map<String, String[]> fields : docs) {
Document doc = new Document();
doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES));
for (Map.Entry<String, String[]> field : fields.entrySet()) {
for (String value : field.getValue()) {
doc.add(toField(field.getKey(), value));
}
}
iw.addDocument(doc);
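        // Commit at random points so documents are spread across multiple segments.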
if (RandomizedTest.randomBoolean()) {
iw.commit();
}
}
iw.flush();
try (DirectoryReader reader = DirectoryReader.open(iw)) {
block.accept(reader);
}
}
}
private IndexableField toField(String name, String value) {
switch (name) {
case FLD_TEXT_NOPOS:
return new Field(name, value, TYPE_STORED_NO_POSITIONS);
case FLD_TEXT_POS:
case FLD_TEXT_SYNONYMS_POS:
return new TextField(name, value, Field.Store.YES);
case FLD_TEXT_POS_OFFS:
case FLD_TEXT_POS_OFFS1:
case FLD_TEXT_POS_OFFS2:
case FLD_TEXT_SYNONYMS_POS_OFFS:
return new Field(name, value, TYPE_STORED_WITH_OFFSETS);
default:
throw new AssertionError("Don't know how to handle this field: " + name);
}
}
private static String fmt(String string, Object... args) {
return String.format(Locale.ROOT, string, args);
}
}


@ -0,0 +1,284 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomAsciiLettersOfLengthBetween;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomRealisticUnicodeOfCodepointLengthBetween;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Test;
public class TestPassageSelector extends LuceneTestCase {
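  // Expected passages are rendered with PassageFormatter("...", ">", "<"): '>' and '<'
  // delimit highlighted regions, "..." marks truncation at a window edge, and multiple
  // passages are joined with '|' (see getPassages at the bottom of this class).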
@Test
public void checkEmptyExtra() {
checkPassages(
"foo >>bar<< baz abc",
"foo bar baz abc",
300,
100,
new OffsetRange(4, 7),
new OffsetRange(4, 7));
checkPassages(
">foo >bar< >baz<< abc",
"foo bar baz abc",
300,
100,
new OffsetRange(0, 11),
new OffsetRange(4, 7),
new OffsetRange(8, 11));
checkPassages(
">>foo< bar >baz<< abc",
"foo bar baz abc",
300,
100,
new OffsetRange(0, 11),
new OffsetRange(0, 3),
new OffsetRange(8, 11));
}
@Test
public void oneMarker() {
checkPassages(">0<123456789a", "0123456789a", 300, 1, new OffsetRange(0, 1));
checkPassages("0123456789>a<", "0123456789a", 300, 1, new OffsetRange(10, 11));
checkPassages(">0123456789a<", "0123456789a", 300, 1, new OffsetRange(0, 11));
}
@Test
public void noHighlights() {
checkPassages("0123456789a", "0123456789a", 300, 1);
checkPassages("01234...", "0123456789a", 5, 1);
checkPassages(
"0123",
"0123456789a",
15,
2,
new OffsetRange[0],
new OffsetRange[] {new OffsetRange(0, 4), new OffsetRange(4, 9)});
}
@Test
public void oneMarkerTruncated() {
checkPassages(">0<12...", "0123456789a", 4, 1, new OffsetRange(0, 1));
checkPassages("...789>a<", "0123456789a", 4, 1, new OffsetRange(10, 11));
checkPassages("...>3456<...", "0123456789a", 4, 1, new OffsetRange(3, 7));
checkPassages("...3>45<6...", "0123456789a", 4, 1, new OffsetRange(4, 6));
}
@Test
public void highlightLargerThanWindow() {
String value = "0123456789a";
checkPassages("0123...", value, 4, 1, new OffsetRange(0, value.length()));
}
@Test
public void twoMarkers() {
checkPassages(
"0>12<3>45<6789a", "0123456789a", 300, 1, new OffsetRange(1, 3), new OffsetRange(4, 6));
checkPassages(
"0>123<>45<6789a", "0123456789a", 300, 1, new OffsetRange(1, 4), new OffsetRange(4, 6));
}
@Test
public void noMarkers() {
checkPassages("0123456789a", "0123456789a", 300, 1);
checkPassages("0123...", "0123456789a", 4, 1);
}
@Test
public void markersOutsideValue() {
checkPassages("0123456789a", "0123456789a", 300, 1, new OffsetRange(100, 200));
}
@Test
public void twoPassages() {
checkPassages(
"0>12<3...|...6>78<9...",
"0123456789a",
4,
2,
new OffsetRange(1, 3),
new OffsetRange(7, 9));
}
@Test
public void emptyRanges() {
    // The highlight does not fall within any non-empty permitted range, so it is
    // omitted; the first non-empty range supplies the default passage instead.
checkPassages(
"6789...",
"0123456789a",
4,
2,
ranges(new OffsetRange(0, 1)),
ranges(new OffsetRange(0, 0), new OffsetRange(2, 2), new OffsetRange(6, 11)));
}
@Test
public void passageScoring() {
// More highlights per passage -> better passage
checkPassages(
">01<>23<...",
"0123456789a",
4,
1,
new OffsetRange(0, 2),
new OffsetRange(2, 4),
new OffsetRange(8, 10));
checkPassages(
"...>01<23>45<67>89<...",
"__________0123456789a__________",
10,
1,
new OffsetRange(10, 12),
new OffsetRange(14, 16),
new OffsetRange(18, 20));
// ...if tied, the one with longer highlight length overall.
checkPassages(
"...6>789<...", "0123456789a", 4, 1, new OffsetRange(0, 2), new OffsetRange(7, 10));
// ...if tied, the first one in order.
checkPassages(">01<23...", "0123456789a", 4, 1, new OffsetRange(0, 2), new OffsetRange(8, 10));
}
@Test
public void rangeWindows() {
// Add constraint windows to split the three highlights.
checkPassages(
"..._______>01<2",
"__________0123456789a__________",
10,
3,
ranges(new OffsetRange(10, 12), new OffsetRange(14, 16), new OffsetRange(18, 20)),
ranges(new OffsetRange(0, 13)));
checkPassages(
">89<a_______...",
"__________0123456789a__________",
10,
3,
ranges(new OffsetRange(10, 12), new OffsetRange(14, 16), new OffsetRange(18, 20)),
ranges(new OffsetRange(18, Integer.MAX_VALUE)));
checkPassages(
"...________>01<|23>45<67|>89<a_______...",
"__________0123456789a__________",
10,
3,
ranges(new OffsetRange(10, 12), new OffsetRange(14, 16), new OffsetRange(18, 20)),
ranges(
new OffsetRange(0, 12),
new OffsetRange(12, 18),
new OffsetRange(18, Integer.MAX_VALUE)));
}
@Test
public void randomizedSanityCheck() {
PassageSelector selector = new PassageSelector();
PassageFormatter formatter = new PassageFormatter("...", ">", "<");
ArrayList<OffsetRange> highlights = new ArrayList<>();
ArrayList<OffsetRange> ranges = new ArrayList<>();
for (int i = 0; i < 5000; i++) {
String value =
randomBoolean()
? randomAsciiLettersOfLengthBetween(0, 100)
: randomRealisticUnicodeOfCodepointLengthBetween(0, 1000);
ranges.clear();
highlights.clear();
for (int j = randomIntBetween(0, 10); --j >= 0; ) {
int from = randomIntBetween(0, value.length());
highlights.add(new OffsetRange(from, from + randomIntBetween(1, 10)));
}
int charWindow = randomIntBetween(1, 100);
int maxPassages = randomIntBetween(1, 10);
if (randomIntBetween(0, 5) == 0) {
int increment = value.length() / 10;
for (int c = randomIntBetween(0, 20), start = 0; --c >= 0; ) {
int step = randomIntBetween(0, increment);
ranges.add(new OffsetRange(start, start + step));
start += step + randomIntBetween(0, 3);
}
} else {
ranges.add(new OffsetRange(0, value.length()));
}
// Just make sure there are no exceptions.
List<Passage> passages =
selector.pickBest(value, highlights, charWindow, maxPassages, ranges);
formatter.format(value, passages, ranges);
}
}
private void checkPassages(
String expected, String value, int charWindow, int maxPassages, OffsetRange... highlights) {
checkPassages(
expected,
value,
charWindow,
maxPassages,
highlights,
ranges(new OffsetRange(0, value.length())));
}
private void checkPassages(
String expected,
String value,
int charWindow,
int maxPassages,
OffsetRange[] highlights,
OffsetRange[] ranges) {
String result = getPassages(value, charWindow, maxPassages, highlights, ranges);
if (!Objects.equals(result, expected)) {
System.out.println("Value: " + value);
System.out.println("Result: " + result);
System.out.println("Expect: " + expected);
}
assertThat(result, Matchers.equalTo(expected));
}
protected String getPassages(
String value,
int charWindow,
int maxPassages,
OffsetRange[] highlights,
OffsetRange[] ranges) {
PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<");
PassageSelector selector = new PassageSelector();
List<OffsetRange> hlist = Arrays.asList(highlights);
List<OffsetRange> rangeList = Arrays.asList(ranges);
List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList);
return String.join("|", passageFormatter.format(value, passages, rangeList));
}
protected OffsetRange[] ranges(OffsetRange... offsets) {
return offsets;
}
}