mirror of https://github.com/apache/lucene.git
LUCENE-9463: Query match region retrieval component, passage scoring and formatting (#1750)
Reviewed as part of previous issue by @romseygeek
This commit is contained in:
parent a003f64649
commit 150a8dacb5
@@ -63,6 +63,9 @@ API Changes

Improvements

* LUCENE-9463: Query match region retrieval component, passage scoring and formatting
  for building custom highlighters. (Alan Woodward, Dawid Weiss)

* LUCENE-9370: RegExp query is no longer lenient about inappropriate backslashes and
  follows the Java Pattern policy for rejecting illegal syntax. (Mark Harwood)
@@ -28,4 +28,5 @@ dependencies {

  testImplementation project(':lucene:test-framework')
  testImplementation project(':lucene:analysis:common')
  testImplementation project(':lucene:queryparser')
}
@@ -38,6 +38,7 @@

    <pathelement path="${memory.jar}"/>
    <pathelement path="${queries.jar}"/>
    <pathelement path="${analyzers-common.jar}"/>
    <pathelement path="${queryparser.jar}"/>
    <path refid="test.base.classpath"/>
  </path>
@@ -0,0 +1,86 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import java.text.BreakIterator;
import java.util.Locale;

/**
 * A {@link PassageAdjuster} that adjusts the {@link Passage} range to
 * word boundaries hinted by the given {@link BreakIterator}.
 */
public class BreakIteratorShrinkingAdjuster implements PassageAdjuster {
  private final BreakIterator bi;
  private CharSequence value;

  public BreakIteratorShrinkingAdjuster() {
    this(BreakIterator.getWordInstance(Locale.ROOT));
  }

  public BreakIteratorShrinkingAdjuster(BreakIterator bi) {
    this.bi = bi;
  }

  @Override
  public void currentValue(CharSequence value) {
    this.value = value;
    bi.setText(new CharSequenceIterator(value));
  }

  @Override
  public OffsetRange adjust(Passage passage) {
    int from = passage.from;
    if (from > 0) {
      while (!bi.isBoundary(from)
          || (from < value.length() && Character.isWhitespace(value.charAt(from)))) {
        from = bi.following(from);
        if (from == BreakIterator.DONE) {
          from = passage.from;
          break;
        }
      }
      if (from == value.length()) {
        from = passage.from;
      }
    }

    int to = passage.to;
    if (to != value.length()) {
      while (!bi.isBoundary(to) || (to > 0 && Character.isWhitespace(value.charAt(to - 1)))) {
        to = bi.preceding(to);
        if (to == BreakIterator.DONE) {
          to = passage.to;
          break;
        }
      }
      if (to == 0) {
        to = passage.to;
      }
    }

    for (OffsetRange r : passage.markers) {
      from = Math.min(from, r.from);
      to = Math.max(to, r.to);
    }

    if (from > to) {
      from = to;
    }

    return new OffsetRange(from, to);
  }
}
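A minimal usage sketch of the adjuster above, assuming a passage selected over a plain
string (the text and offsets are illustrative, not part of this commit):

import java.util.List;
import org.apache.lucene.search.matchhighlight.BreakIteratorShrinkingAdjuster;
import org.apache.lucene.search.matchhighlight.OffsetRange;
import org.apache.lucene.search.matchhighlight.Passage;

public class AdjusterExample {
  public static void main(String[] args) {
    String value = "The quick brown fox jumps over the lazy dog";
    // A passage that starts and ends mid-word; the marker covers "brown".
    Passage passage = new Passage(5, 18, List.of(new OffsetRange(10, 15)));

    BreakIteratorShrinkingAdjuster adjuster = new BreakIteratorShrinkingAdjuster();
    adjuster.currentValue(value);
    // Shrinks the passage to the nearest word boundaries, but never beyond
    // the highlight markers it contains; prints "brown" here.
    OffsetRange adjusted = adjuster.adjust(passage);
    System.out.println(value.subSequence(adjusted.from, adjusted.to));
  }
}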
@@ -0,0 +1,104 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import java.text.CharacterIterator;

/**
 * A {@link CharacterIterator} over a {@link CharSequence}.
 */
final class CharSequenceIterator implements CharacterIterator {
  private final CharSequence text;

  private int begin;
  private int end;
  private int pos;

  public CharSequenceIterator(CharSequence text) {
    this.text = text;
    this.begin = 0;
    this.end = text.length();
  }

  public char first() {
    pos = begin;
    return current();
  }

  public char last() {
    if (end != begin) {
      pos = end - 1;
    } else {
      pos = end;
    }
    return current();
  }

  public char setIndex(int p) {
    if (p < begin || p > end) throw new IllegalArgumentException("Invalid index");
    pos = p;
    return current();
  }

  public char current() {
    if (pos >= begin && pos < end) {
      return text.charAt(pos);
    } else {
      return DONE;
    }
  }

  public char next() {
    if (pos < end - 1) {
      pos++;
      return text.charAt(pos);
    } else {
      pos = end;
      return DONE;
    }
  }

  public char previous() {
    if (pos > begin) {
      pos--;
      return text.charAt(pos);
    } else {
      return DONE;
    }
  }

  public int getBeginIndex() {
    return begin;
  }

  public int getEndIndex() {
    return end;
  }

  public int getIndex() {
    return pos;
  }

  @Override
  public Object clone() {
    try {
      return super.clone();
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e);
    }
  }
}
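For illustration, the iterator above lets a java.text.BreakIterator walk any CharSequence
without first copying it into a String. A sketch (assumed usage within the same package,
since the class is package-private):

package org.apache.lucene.search.matchhighlight;

import java.text.BreakIterator;
import java.util.Locale;

class CharSequenceIteratorDemo {
  public static void main(String[] args) {
    BreakIterator words = BreakIterator.getWordInstance(Locale.ROOT);
    CharSequence text = new StringBuilder("two words");
    // No String copy needed: the iterator walks the CharSequence directly.
    words.setText(new CharSequenceIterator(text));
    System.out.println(words.following(0)); // 3 -- the boundary after "two"
  }
}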
@@ -0,0 +1,304 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.PrimitiveIterator;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Predicate;

/**
 * Utility class to compute a list of "match regions" for a given query, searcher and
 * document(s) using the {@link Matches} API.
 */
public class MatchRegionRetriever {
  private final List<LeafReaderContext> leaves;
  private final Weight weight;
  private final TreeSet<String> affectedFields;
  private final Map<String, OffsetsRetrievalStrategy> offsetStrategies;
  private final Set<String> preloadFields;

  /**
   * A callback for accepting a single document (and its associated leaf reader, leaf document ID)
   * and its match offset ranges, as indicated by the {@link Matches} interface retrieved for
   * the query.
   */
  @FunctionalInterface
  public interface MatchOffsetsConsumer {
    void accept(int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits)
        throws IOException;
  }

  /**
   * An abstraction that provides document values for a given field. The default implementation
   * in {@link DocumentFieldValueProvider} just reaches into a preloaded {@link Document}. It is
   * possible to write a more efficient implementation on top of a reusable character buffer
   * (that reuses the buffer while retrieving hit regions for documents).
   */
  @FunctionalInterface
  public interface FieldValueProvider {
    List<CharSequence> getValues(String field);
  }

  /**
   * A constructor with the default offset strategy supplier.
   */
  public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer) throws IOException {
    this(searcher, query, analyzer, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
  }

  /**
   * @param searcher Index searcher to be used for retrieving matches.
   * @param query The query for which matches should be retrieved. The query should be rewritten
   *     against the provided searcher.
   * @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
   *     in the absence of position offsets in the index. Note that the analyzer must return
   *     tokens (positions and offsets) identical to the ones stored in the index.
   * @param fieldOffsetStrategySupplier A custom supplier of per-field {@link OffsetsRetrievalStrategy}
   *     instances.
   */
  public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer,
                              OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier)
      throws IOException {
    leaves = searcher.getIndexReader().leaves();
    assert checkOrderConsistency(leaves);

    // We need full scoring mode so that we can receive matches from all sub-clauses
    // (no optimizations in Boolean queries take place).
    weight = searcher.createWeight(query, ScoreMode.COMPLETE, 0);

    // Compute the subset of fields affected by this query so that we don't load or scan
    // fields that are irrelevant.
    affectedFields = new TreeSet<>();
    query.visit(
        new QueryVisitor() {
          @Override
          public boolean acceptField(String field) {
            affectedFields.add(field);
            return false;
          }
        });

    // Compute the value offset retrieval strategy for all affected fields.
    offsetStrategies = new HashMap<>();
    for (String field : affectedFields) {
      offsetStrategies.put(field, fieldOffsetStrategySupplier.apply(field));
    }

    // Ask offset strategies if they'll need field values.
    preloadFields = new HashSet<>();
    offsetStrategies.forEach(
        (field, strategy) -> {
          if (strategy.requiresDocument()) {
            preloadFields.add(field);
          }
        });

    // Only preload those field values that can be affected by the query and are required
    // by strategies.
    preloadFields.retainAll(affectedFields);
  }

  public void highlightDocuments(TopDocs topDocs, MatchOffsetsConsumer consumer) throws IOException {
    highlightDocuments(Arrays.stream(topDocs.scoreDocs)
        .mapToInt(scoreDoc -> scoreDoc.doc)
        .sorted()
        .iterator(), consumer);
  }

  /**
   * Low-level, high-efficiency method for highlighting large numbers of documents at once in a
   * streaming fashion.
   *
   * @param docIds A stream of <em>sorted</em> document identifiers for which hit ranges should
   *     be returned.
   * @param consumer A streaming consumer for document-hits pairs.
   */
  public void highlightDocuments(PrimitiveIterator.OfInt docIds, MatchOffsetsConsumer consumer)
      throws IOException {
    if (leaves.isEmpty()) {
      return;
    }

    Iterator<LeafReaderContext> ctx = leaves.iterator();
    LeafReaderContext currentContext = ctx.next();
    int previousDocId = -1;
    Map<String, List<OffsetRange>> highlights = new TreeMap<>();
    while (docIds.hasNext()) {
      int docId = docIds.nextInt();

      if (docId < previousDocId) {
        throw new RuntimeException("Input document IDs must be sorted (increasing).");
      }
      previousDocId = docId;

      while (docId >= currentContext.docBase + currentContext.reader().maxDoc()) {
        currentContext = ctx.next();
      }

      int contextRelativeDocId = docId - currentContext.docBase;

      // Only preload fields we may potentially need.
      FieldValueProvider documentSupplier;
      if (preloadFields.isEmpty()) {
        documentSupplier = null;
      } else {
        Document doc = currentContext.reader().document(contextRelativeDocId, preloadFields);
        documentSupplier = new DocumentFieldValueProvider(doc);
      }

      highlights.clear();
      highlightDocument(currentContext, contextRelativeDocId, documentSupplier, (field) -> true, highlights);
      consumer.accept(docId, currentContext.reader(), contextRelativeDocId, highlights);
    }
  }

  /**
   * Low-level method for retrieving hit ranges for a single document. This method can be used with
   * a custom document {@link FieldValueProvider}.
   */
  public void highlightDocument(
      LeafReaderContext leafReaderContext,
      int contextDocId,
      FieldValueProvider doc,
      Predicate<String> acceptField,
      Map<String, List<OffsetRange>> outputHighlights)
      throws IOException {
    Matches matches = weight.matches(leafReaderContext, contextDocId);
    if (matches == null) {
      return;
    }

    for (String field : affectedFields) {
      if (acceptField.test(field)) {
        MatchesIterator matchesIterator = matches.getMatches(field);
        if (matchesIterator == null) {
          // No matches on this field, even though the field was part of the query. This may be
          // possible with complex queries that source non-text fields (have no "hit regions" in
          // any textual representation). Skip.
        } else {
          OffsetsRetrievalStrategy offsetStrategy = offsetStrategies.get(field);
          if (offsetStrategy == null) {
            throw new IOException(
                "Non-empty matches but no offset retrieval strategy for field: " + field);
          }
          List<OffsetRange> ranges = offsetStrategy.get(matchesIterator, doc);
          if (!ranges.isEmpty()) {
            outputHighlights.put(field, ranges);
          }
        }
      }
    }
  }

  private boolean checkOrderConsistency(List<LeafReaderContext> leaves) {
    for (int i = 1; i < leaves.size(); i++) {
      LeafReaderContext prev = leaves.get(i - 1);
      LeafReaderContext next = leaves.get(i);
      assert prev.docBase <= next.docBase;
      assert prev.docBase + prev.reader().maxDoc() == next.docBase;
    }
    return true;
  }

  /**
   * Compute default strategies for retrieving offsets from {@link MatchesIterator}
   * instances for a set of given fields.
   */
  public static OffsetsRetrievalStrategySupplier computeOffsetRetrievalStrategies(
      IndexReader reader, Analyzer analyzer) {
    FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader);
    return (field) -> {
      FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      if (fieldInfo == null) {
        return (mi, doc) -> {
          throw new IOException("FieldInfo is null for field: " + field);
        };
      }

      switch (fieldInfo.getIndexOptions()) {
        case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
          return new OffsetsFromMatchIterator(field);

        case DOCS_AND_FREQS_AND_POSITIONS:
          return new OffsetsFromPositions(field, analyzer);

        case DOCS_AND_FREQS:
        case DOCS:
          // By default retrieve offsets from individual tokens
          // retrieved by the analyzer (possibly narrowed down to
          // only those terms that the query hinted at when passed
          // a QueryVisitor).
          //
          // Alternative strategies are also possible and may make sense
          // depending on the use case (OffsetsFromValues, for example).
          return new OffsetsFromTokens(field, analyzer);

        default:
          return (matchesIterator, doc) -> {
            throw new IOException(
                "Field is indexed without positions and/or offsets: "
                    + field
                    + ", "
                    + fieldInfo.getIndexOptions());
          };
      }
    };
  }

  /**
   * Implements {@link FieldValueProvider} wrapping a preloaded
   * {@link Document}.
   */
  private static final class DocumentFieldValueProvider implements FieldValueProvider {
    private final Document doc;

    public DocumentFieldValueProvider(Document doc) {
      this.doc = doc;
    }

    @Override
    public List<CharSequence> getValues(String field) {
      return Arrays.asList(doc.getValues(field));
    }
  }
}
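A sketch of driving the retriever end to end (index setup elided; the query is assumed
to be rewritten against the searcher, as the constructor's javadoc requires):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.matchhighlight.MatchRegionRetriever;

class HitRegionPrinter {
  static void printHitRegions(IndexSearcher searcher, Query query, Analyzer analyzer)
      throws IOException {
    MatchRegionRetriever retriever = new MatchRegionRetriever(searcher, query, analyzer);
    TopDocs topDocs = searcher.search(query, 10);
    retriever.highlightDocuments(topDocs, (docId, leafReader, leafDocId, hits) -> {
      // hits maps each field touched by the query to its match offset ranges.
      hits.forEach((field, ranges) -> System.out.println(docId + " " + field + " => " + ranges));
    });
  }
}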
@@ -0,0 +1,65 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import java.util.Objects;

/**
 * A non-empty range of offset positions.
 */
public class OffsetRange {
  /** Start index, inclusive. */
  public final int from;

  /** End index, exclusive. */
  public final int to;

  /**
   * @param from Start index, inclusive.
   * @param to End index, exclusive.
   */
  public OffsetRange(int from, int to) {
    assert from <= to : "A non-empty offset range is required: " + from + "-" + to;
    this.from = from;
    this.to = to;
  }

  public int length() {
    return to - from;
  }

  @Override
  public String toString() {
    return "[from=" + from + ", to=" + to + "]";
  }

  @Override
  public boolean equals(Object other) {
    if (other == this) return true;
    if (other instanceof OffsetRange) {
      OffsetRange that = (OffsetRange) other;
      return from == that.from && to == that.to;
    } else {
      return false;
    }
  }

  @Override
  public int hashCode() {
    return Objects.hash(from, to);
  }
}
@@ -0,0 +1,49 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import org.apache.lucene.search.MatchesIterator;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * This strategy retrieves offsets directly from {@link MatchesIterator}.
 */
public final class OffsetsFromMatchIterator implements OffsetsRetrievalStrategy {
  private final String field;

  OffsetsFromMatchIterator(String field) {
    this.field = field;
  }

  @Override
  public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
      throws IOException {
    ArrayList<OffsetRange> ranges = new ArrayList<>();
    while (matchesIterator.next()) {
      int from = matchesIterator.startOffset();
      int to = matchesIterator.endOffset();
      if (from < 0 || to < 0) {
        throw new IOException("Matches API returned negative offsets for field: " + field);
      }
      ranges.add(new OffsetRange(from, to));
    }
    return ranges;
  }
}
@@ -0,0 +1,154 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.search.MatchesIterator;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This strategy applies to fields with stored positions but no offsets. We re-analyze
 * the field's value to find out offsets of match positions.
 * <p>
 * Note that this may fail if index data (positions stored in the index) is out of sync
 * with the field values or the analyzer. This strategy assumes this never happens.
 */
public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
  private final String field;
  private final Analyzer analyzer;

  OffsetsFromPositions(String field, Analyzer analyzer) {
    this.field = field;
    this.analyzer = analyzer;
  }

  @Override
  public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
      throws IOException {
    ArrayList<OffsetRange> ranges = new ArrayList<>();
    while (matchesIterator.next()) {
      int from = matchesIterator.startPosition();
      int to = matchesIterator.endPosition();
      if (from < 0 || to < 0) {
        throw new IOException("Matches API returned negative positions for field: " + field);
      }
      ranges.add(new OffsetRange(from, to));
    }

    // Convert from positions to offsets.
    ranges = convertPositionsToOffsets(ranges, analyzer, field, doc.getValues(field));

    return ranges;
  }

  @Override
  public boolean requiresDocument() {
    return true;
  }

  private static ArrayList<OffsetRange> convertPositionsToOffsets(
      ArrayList<OffsetRange> ranges,
      Analyzer analyzer,
      String fieldName,
      List<CharSequence> values)
      throws IOException {

    if (ranges.isEmpty()) {
      return ranges;
    }

    class LeftRight {
      int left = Integer.MAX_VALUE;
      int right = Integer.MIN_VALUE;

      @Override
      public String toString() {
        return "[" + "L: " + left + ", R: " + right + ']';
      }
    }

    Map<Integer, LeftRight> requiredPositionSpans = new HashMap<>();
    int minPosition = Integer.MAX_VALUE;
    int maxPosition = Integer.MIN_VALUE;
    for (OffsetRange range : ranges) {
      requiredPositionSpans.computeIfAbsent(range.from, (key) -> new LeftRight());
      requiredPositionSpans.computeIfAbsent(range.to, (key) -> new LeftRight());
      minPosition = Math.min(minPosition, range.from);
      maxPosition = Math.max(maxPosition, range.to);
    }

    int position = -1;
    int valueOffset = 0;
    for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) {
      final String value = values.get(valueIndex).toString();
      final boolean lastValue = valueIndex + 1 == max;

      TokenStream ts = analyzer.tokenStream(fieldName, value);
      OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
      PositionIncrementAttribute posAttr = ts.getAttribute(PositionIncrementAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        position += posAttr.getPositionIncrement();

        if (position >= minPosition) {
          LeftRight leftRight = requiredPositionSpans.get(position);
          if (leftRight != null) {
            int startOffset = valueOffset + offsetAttr.startOffset();
            int endOffset = valueOffset + offsetAttr.endOffset();

            leftRight.left = Math.min(leftRight.left, startOffset);
            leftRight.right = Math.max(leftRight.right, endOffset);
          }

          // Only short-circuit if we're on the last value (which should be the common
          // case since most fields would only have a single value anyway). We need
          // to make sure of this because otherwise offsetAttr would have an incorrect value.
          if (position > maxPosition && lastValue) {
            break;
          }
        }
      }
      ts.end();
      position += posAttr.getPositionIncrement() + analyzer.getPositionIncrementGap(fieldName);
      valueOffset += offsetAttr.endOffset() + analyzer.getOffsetGap(fieldName);
      ts.close();
    }

    ArrayList<OffsetRange> converted = new ArrayList<>();
    for (OffsetRange range : ranges) {
      LeftRight left = requiredPositionSpans.get(range.from);
      LeftRight right = requiredPositionSpans.get(range.to);
      if (left == null
          || right == null
          || left.left == Integer.MAX_VALUE
          || right.right == Integer.MIN_VALUE) {
        throw new RuntimeException("Position not properly initialized for range: " + range);
      }
      converted.add(new OffsetRange(left.left, right.right));
    }

    return converted;
  }
}
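For context, computeOffsetRetrievalStrategies picks the strategy above for fields indexed
with positions but without offsets. A fragment sketching the corresponding indexing side
(the field name and value are assumptions, not from this commit):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;

Document doc = new Document();
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); // positions, no offsets
type.freeze();
doc.add(new Field("body", "The quick brown fox", type));
// At highlight time, OffsetsFromPositions re-runs the analyzer over the stored value
// to map match positions back to character offsets.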
@@ -0,0 +1,97 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * This strategy works for fields where we know the match occurred but there are
 * no known positions or offsets.
 * <p>
 * We re-analyze field values and return offset ranges for returned tokens that
 * are also returned by the query's term collector.
 */
public final class OffsetsFromTokens implements OffsetsRetrievalStrategy {
  private final String field;
  private final Analyzer analyzer;

  public OffsetsFromTokens(String field, Analyzer analyzer) {
    this.field = field;
    this.analyzer = analyzer;
  }

  @Override
  public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) throws IOException {
    List<CharSequence> values = doc.getValues(field);

    Set<BytesRef> matchTerms = new HashSet<>();
    while (matchesIterator.next()) {
      Query q = matchesIterator.getQuery();
      q.visit(new QueryVisitor() {
        @Override
        public void consumeTerms(Query query, Term... terms) {
          for (Term t : terms) {
            if (field.equals(t.field())) {
              matchTerms.add(t.bytes());
            }
          }
        }
      });
    }

    ArrayList<OffsetRange> ranges = new ArrayList<>();
    int valueOffset = 0;
    for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) {
      final String value = values.get(valueIndex).toString();

      TokenStream ts = analyzer.tokenStream(field, value);
      OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
      TermToBytesRefAttribute termAttr = ts.getAttribute(TermToBytesRefAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        if (matchTerms.contains(termAttr.getBytesRef())) {
          int startOffset = valueOffset + offsetAttr.startOffset();
          int endOffset = valueOffset + offsetAttr.endOffset();
          ranges.add(new OffsetRange(startOffset, endOffset));
        }
      }
      ts.end();
      valueOffset += offsetAttr.endOffset() + analyzer.getOffsetGap(field);
      ts.close();
    }
    return ranges;
  }

  @Override
  public boolean requiresDocument() {
    return true;
  }
}
@@ -0,0 +1,74 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.search.MatchesIterator;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * This strategy works for fields where we know the match occurred but there are
 * no known positions or offsets.
 * <p>
 * We re-analyze field values and return offset ranges for entire values
 * (not individual tokens). Re-analysis is required because the analyzer may return
 * an unknown offset gap.
 */
public final class OffsetsFromValues implements OffsetsRetrievalStrategy {
  private final String field;
  private final Analyzer analyzer;

  public OffsetsFromValues(String field, Analyzer analyzer) {
    this.field = field;
    this.analyzer = analyzer;
  }

  @Override
  public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) throws IOException {
    List<CharSequence> values = doc.getValues(field);

    ArrayList<OffsetRange> ranges = new ArrayList<>();
    int valueOffset = 0;
    for (CharSequence charSequence : values) {
      final String value = charSequence.toString();

      TokenStream ts = analyzer.tokenStream(field, value);
      OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
      ts.reset();
      int startOffset = valueOffset;
      while (ts.incrementToken()) {
        // Go through all tokens to increment the offset attribute properly.
      }
      ts.end();
      valueOffset += offsetAttr.endOffset();
      ranges.add(new OffsetRange(startOffset, valueOffset));
      valueOffset += analyzer.getOffsetGap(field);
      ts.close();
    }
    return ranges;
  }

  @Override
  public boolean requiresDocument() {
    return true;
  }
}
@@ -0,0 +1,42 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import org.apache.lucene.search.MatchesIterator;

import java.io.IOException;
import java.util.List;

/**
 * Determines how match offset regions are computed from {@link MatchesIterator}. Several
 * possibilities exist, ranging from retrieving offsets directly from a match instance
 * to re-evaluating the document's field and recomputing offsets from there.
 */
public interface OffsetsRetrievalStrategy {
  /**
   * Return value offsets (match ranges) acquired from the given {@link MatchesIterator}.
   */
  List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
      throws IOException;

  /**
   * Whether this strategy requires document field access.
   */
  default boolean requiresDocument() {
    return false;
  }
}
@@ -0,0 +1,26 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import java.util.function.Function;

/**
 * A per-field supplier of {@link OffsetsRetrievalStrategy}.
 */
@FunctionalInterface
public interface OffsetsRetrievalStrategySupplier extends Function<String, OffsetsRetrievalStrategy> {
}
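Because the supplier is just a Function from field name to strategy, per-field overrides
can wrap the defaults. A hypothetical sketch (reader, searcher, query and analyzer are
assumed to be in scope; the "title" field and the whole-value choice are illustrative):

OffsetsRetrievalStrategySupplier defaults =
    MatchRegionRetriever.computeOffsetRetrievalStrategies(reader, analyzer);
OffsetsRetrievalStrategySupplier custom =
    (field) ->
        "title".equals(field)
            ? new OffsetsFromValues(field, analyzer) // highlight whole title values
            : defaults.apply(field);
MatchRegionRetriever retriever = new MatchRegionRetriever(searcher, query, analyzer, custom);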
@@ -0,0 +1,39 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import java.util.List;

/**
 * A passage is a fragment of source text, scored and possibly with a list of sub-offsets (markers)
 * to be highlighted. The markers can be overlapping or nested, but they're always contained within
 * the passage.
 */
public class Passage extends OffsetRange {
  public List<OffsetRange> markers;

  public Passage(int from, int to, List<OffsetRange> markers) {
    super(from, to);

    this.markers = markers;
  }

  @Override
  public String toString() {
    return "[" + super.toString() + ", markers=" + markers + "]";
  }
}
@@ -0,0 +1,27 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

/**
 * Adjusts the range of one or more passages over a given value. An adjuster could,
 * for example, shift a passage's boundary to the next or previous word delimiter
 * or whitespace.
 */
public interface PassageAdjuster {
  void currentValue(CharSequence value);

  OffsetRange adjust(Passage p);
}
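As an illustration, a trivial adjuster that only trims leading and trailing whitespace
from a passage might look like this (a sketch, not part of this commit):

PassageAdjuster whitespaceTrimmer = new PassageAdjuster() {
  private CharSequence value;

  @Override
  public void currentValue(CharSequence value) {
    this.value = value;
  }

  @Override
  public OffsetRange adjust(Passage p) {
    int from = p.from;
    int to = p.to;
    while (from < to && Character.isWhitespace(value.charAt(from))) {
      from++;
    }
    while (to > from && Character.isWhitespace(value.charAt(to - 1))) {
      to--;
    }
    // Adjusters may shrink the passage's range but must never expand it.
    return new OffsetRange(from, to);
  }
};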
@@ -0,0 +1,214 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.RandomAccess;
import java.util.function.Function;

/**
 * Formats a collection of {@linkplain Passage passages} over a given string, resolving
 * restrictions on overlaps, permitted sub-ranges of the input string, and passage length.
 *
 * Passages are demarcated with constructor-provided ellipsis and start/end marker
 * sequences.
 */
public class PassageFormatter {
  private final String ellipsis;
  private final Function<OffsetRange, String> markerStart;
  private final Function<OffsetRange, String> markerEnd;

  private final ArrayList<OffsetRange> markerStack = new ArrayList<>();

  public PassageFormatter(String ellipsis, String markerStart, String markerEnd) {
    this(ellipsis, (m) -> markerStart, (m) -> markerEnd);
  }

  public PassageFormatter(
      String ellipsis,
      Function<OffsetRange, String> markerStart,
      Function<OffsetRange, String> markerEnd) {
    this.ellipsis = ellipsis;
    this.markerStart = markerStart;
    this.markerEnd = markerEnd;
  }

  public List<String> format(CharSequence value, List<Passage> passages, List<OffsetRange> ranges) {
    assert PassageSelector.sortedAndNonOverlapping(passages);
    assert PassageSelector.sortedAndNonOverlapping(ranges);
    assert ranges instanceof RandomAccess;

    if (ranges.isEmpty()) {
      return Collections.emptyList();
    }

    ArrayList<String> result = new ArrayList<>();
    StringBuilder buf = new StringBuilder();

    int rangeIndex = 0;
    OffsetRange range = ranges.get(rangeIndex);
    passageFormatting:
    for (Passage passage : passages) {
      // Move to the range of the current passage.
      while (passage.from >= range.to) {
        if (++rangeIndex == ranges.size()) {
          break passageFormatting;
        }
        range = ranges.get(rangeIndex);
      }

      assert range.from <= passage.from && range.to >= passage.to : range + " ? " + passage;

      buf.setLength(0);
      if (range.from < passage.from) {
        buf.append(ellipsis);
      }
      format(buf, value, passage);
      if (range.to > passage.to) {
        buf.append(ellipsis);
      }
      result.add(buf.toString());
    }
    return result;
  }

  public StringBuilder format(StringBuilder buf, CharSequence value, final Passage passage) {
    switch (passage.markers.size()) {
      case 0:
        // No markers, full passage appended.
        buf.append(value, passage.from, passage.to);
        break;

      case 1:
        // One marker; a trivial and frequent case, so it's handled separately.
        OffsetRange m = passage.markers.iterator().next();
        buf.append(value, passage.from, m.from);
        buf.append(markerStart.apply(m));
        buf.append(value, m.from, m.to);
        buf.append(markerEnd.apply(m));
        buf.append(value, m.to, passage.to);
        break;

      default:
        // Multiple markers, possibly overlapping or nested.
        markerStack.clear();
        multipleMarkers(value, passage, buf, markerStack);
        break;
    }

    return buf;
  }

  /** Handle multiple markers, possibly overlapping or nested. */
  private void multipleMarkers(
      CharSequence value, final Passage p, StringBuilder b, ArrayList<OffsetRange> markerStack) {
    int at = p.from;
    int max = p.to;
    SlicePoint[] slicePoints = slicePoints(p);
    for (SlicePoint slicePoint : slicePoints) {
      b.append(value, at, slicePoint.offset);
      OffsetRange currentMarker = slicePoint.marker;
      switch (slicePoint.type) {
        case START:
          markerStack.add(currentMarker);
          b.append(markerStart.apply(currentMarker));
          break;

        case END:
          int markerIndex = markerStack.lastIndexOf(currentMarker);
          for (int k = markerIndex; k < markerStack.size(); k++) {
            b.append(markerEnd.apply(markerStack.get(k)));
          }
          markerStack.remove(markerIndex);
          for (int k = markerIndex; k < markerStack.size(); k++) {
            b.append(markerStart.apply(markerStack.get(k)));
          }
          break;

        default:
          throw new RuntimeException();
      }

      at = slicePoint.offset;
    }

    if (at < max) {
      b.append(value, at, max);
    }
  }

  private static SlicePoint[] slicePoints(Passage p) {
    SlicePoint[] slicePoints = new SlicePoint[p.markers.size() * 2];
    int x = 0;
    for (OffsetRange m : p.markers) {
      slicePoints[x++] = new SlicePoint(SlicePoint.Type.START, m.from, m);
      slicePoints[x++] = new SlicePoint(SlicePoint.Type.END, m.to, m);
    }

    // Order slice points by their offset.
    Comparator<SlicePoint> c =
        Comparator.<SlicePoint>comparingInt(pt -> pt.offset)
            .thenComparingInt(pt -> pt.type.ordering)
            .thenComparing(
                (a, b) -> {
                  if (a.type == SlicePoint.Type.START) {
                    // Longer start slice points come first.
                    return Integer.compare(b.marker.to, a.marker.to);
                  } else {
                    // Shorter end slice points come first.
                    return Integer.compare(b.marker.from, a.marker.from);
                  }
                });

    Arrays.sort(slicePoints, c);

    return slicePoints;
  }

  static class SlicePoint {
    enum Type {
      START(2),
      END(1);

      private final int ordering;

      Type(int ordering) {
        this.ordering = ordering;
      }
    }

    public final int offset;
    public final Type type;
    public final OffsetRange marker;

    public SlicePoint(Type t, int offset, OffsetRange m) {
      this.type = t;
      this.offset = offset;
      this.marker = m;
    }

    @Override
    public String toString() {
      return "(" + type + ", " + marker + ")";
    }
  }
}
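A short formatting sketch with HTML-style markers (the passage and ranges are illustrative):

String value = "The quick brown fox";
PassageFormatter formatter = new PassageFormatter("...", "<b>", "</b>");
List<Passage> passages = List.of(new Passage(4, 19, List.of(new OffsetRange(10, 15))));
// The permitted range covers the entire value.
List<OffsetRange> ranges = List.of(new OffsetRange(0, value.length()));
// The leading ellipsis appears because the permitted range starts before the passage:
// prints [...quick <b>brown</b> fox]
System.out.println(formatter.format(value, passages, ranges));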
@ -0,0 +1,273 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.matchhighlight;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.RandomAccess;
|
||||
|
||||
/** Selects fragments of text that score best for the given set of highlight markers. */
|
||||
public class PassageSelector {
|
||||
public static final Comparator<Passage> DEFAULT_SCORER =
|
||||
(a, b) -> {
|
||||
// Compare the number of highlights first.
|
||||
int v;
|
||||
v = Integer.compare(a.markers.size(), b.markers.size());
|
||||
if (v != 0) {
|
||||
return v;
|
||||
}
|
||||
|
||||
// Total number of characters covered by the highlights.
|
||||
int len1 = 0, len2 = 0;
|
||||
for (OffsetRange o : a.markers) {
|
||||
len1 += o.length();
|
||||
}
|
||||
for (OffsetRange o : b.markers) {
|
||||
len2 += o.length();
|
||||
}
|
||||
if (len1 != len2) {
|
||||
return Integer.compare(len1, len2);
|
||||
}
|
||||
|
||||
return Integer.compare(b.from, a.from);
|
||||
};
|
||||
|
||||
private final Comparator<Passage> passageScorer;
|
||||
private final PassageAdjuster passageAdjuster;
|
||||
|
||||
public PassageSelector() {
|
||||
this(DEFAULT_SCORER, null);
|
||||
}
|
||||
|
||||
public PassageSelector(Comparator<Passage> passageScorer, PassageAdjuster passageAdjuster) {
|
||||
this.passageScorer = passageScorer;
|
||||
this.passageAdjuster = passageAdjuster;
|
||||
}
|
||||
|
||||
public List<Passage> pickBest(
|
||||
CharSequence value,
|
||||
List<? extends OffsetRange> markers,
|
||||
int maxPassageWindow,
|
||||
int maxPassages) {
|
||||
return pickBest(
|
||||
value, markers, maxPassageWindow, maxPassages, List.of(new OffsetRange(0, value.length())));
|
||||
}
|
||||
|
||||
public List<Passage> pickBest(
|
||||
CharSequence value,
|
||||
List<? extends OffsetRange> markers,
|
||||
int maxPassageWindow,
|
||||
int maxPassages,
|
||||
List<OffsetRange> permittedPassageRanges) {
|
||||
assert markers instanceof RandomAccess && permittedPassageRanges instanceof RandomAccess;
|
||||
|
||||
// Handle odd special cases early.
|
||||
if (value.length() == 0 || maxPassageWindow == 0) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
// Sort markers by their start offset, shortest first.
|
||||
markers.sort(
|
||||
(a, b) -> {
|
||||
          int v = Integer.compare(a.from, b.from);
          return v != 0 ? v : Integer.compare(a.to, b.to);
        });

    // Determine a maximum offset window around each highlight marker and
    // pick the best scoring passage candidates.
    PriorityQueue<Passage> pq =
        new PriorityQueue<>(maxPassages) {
          @Override
          protected boolean lessThan(Passage a, Passage b) {
            return passageScorer.compare(a, b) < 0;
          }
        };

    assert sortedAndNonOverlapping(permittedPassageRanges);

    final int max = markers.size();
    int markerIndex = 0;
    nextRange:
    for (OffsetRange range : permittedPassageRanges) {
      final int rangeTo = Math.min(range.to, value.length());

      // Skip ranges that fall outside of the value window.
      if (range.from >= rangeTo) {
        continue;
      }

      while (markerIndex < max) {
        OffsetRange m = markers.get(markerIndex);

        // Markers are sorted, so if the current marker's start is past the range,
        // we can advance, but we need to check the same marker against the new range.
        if (m.from >= rangeTo) {
          continue nextRange;
        }

        // Check if the current marker falls within the range and is smaller than the largest
        // allowed passage window.
        if (m.from >= range.from && m.to <= rangeTo && m.length() <= maxPassageWindow) {

          // Adjust the window range to center the highlight marker.
          int from = (m.from + m.to - maxPassageWindow) / 2;
          int to = (m.from + m.to + maxPassageWindow) / 2;
          if (from < range.from) {
            to += range.from - from;
            from = range.from;
          }
          if (to > rangeTo) {
            from -= to - rangeTo;
            to = rangeTo;
            if (from < range.from) {
              from = range.from;
            }
          }

          if (from < to && to <= value.length()) {
            // Find other markers that are completely inside the passage window.
            ArrayList<OffsetRange> inside = new ArrayList<>();
            int i = markerIndex;
            while (i > 0 && markers.get(i - 1).from >= from) {
              i--;
            }

            OffsetRange c;
            for (; i < max && (c = markers.get(i)).from < to; i++) {
              if (c.to <= to) {
                inside.add(c);
              }
            }

            if (!inside.isEmpty()) {
              pq.insertWithOverflow(new Passage(from, to, inside));
            }
          }
        }

        // Advance to the next marker.
        markerIndex++;
      }
    }

    // Collect from the priority queue (reverse the order so that highest-scoring are first).
    Passage[] passages;
    if (pq.size() > 0) {
      passages = new Passage[pq.size()];
      for (int i = pq.size(); --i >= 0; ) {
        passages[i] = pq.pop();
      }
    } else {
      // Handle the default, no-highlighting-markers case.
      passages = pickDefaultPassage(value, maxPassageWindow, permittedPassageRanges);
    }

    // Correct passage boundaries within the max-exclusive window: typically shrink them until
    // we're on a proper word/sentence boundary.
    if (passageAdjuster != null) {
      passageAdjuster.currentValue(value);
      for (int x = 0; x < passages.length; x++) {
        Passage p = passages[x];
        OffsetRange newRange = passageAdjuster.adjust(p);
        if (newRange.from != p.from || newRange.to != p.to) {
          assert newRange.from >= p.from && newRange.to <= p.to
              : "Adjusters must not expand the passage's range: was "
                  + p
                  + " => changed to "
                  + newRange;
          passages[x] = new Passage(newRange.from, newRange.to, p.markers);
        }
      }
    }

    // Ensure there are no overlaps between passages. In case of conflicts, the better score wins.
    int last = 0;
    for (int i = 0; i < passages.length; i++) {
      Passage a = passages[i];
      if (a != null && a.length() > 0) {
        passages[last++] = a;
        for (int j = i + 1; j < passages.length; j++) {
          Passage b = passages[j];
          if (b != null) {
            if (adjacentOrOverlapping(a, b)) {
              passages[j] = null;
            }
          }
        }
      }
    }

    // Remove nullified slots.
    if (passages.length != last) {
      passages = ArrayUtil.copyOfSubArray(passages, 0, last);
    }

    // Sort in offset order again.
    Arrays.sort(passages, (a, b) -> Integer.compare(a.from, b.from));

    return Arrays.asList(passages);
  }

  static boolean sortedAndNonOverlapping(List<? extends OffsetRange> permittedPassageRanges) {
    if (permittedPassageRanges.size() > 1) {
      Iterator<? extends OffsetRange> i = permittedPassageRanges.iterator();
      for (OffsetRange next, previous = i.next(); i.hasNext(); previous = next) {
        next = i.next();
        if (previous.to > next.from) {
          throw new AssertionError(
              "Ranges must be sorted and non-overlapping: " + permittedPassageRanges);
        }
      }
    }

    return true;
  }

  /**
   * Invoked when no passages could be selected (due to constraints or the lack of highlight
   * markers).
   */
  protected Passage[] pickDefaultPassage(
      CharSequence value, int maxCharacterWindow, List<OffsetRange> permittedPassageRanges) {
    // Search for the first range that is not empty.
    for (OffsetRange o : permittedPassageRanges) {
      int to = Math.min(value.length(), o.to);
      if (o.from < to) {
        return new Passage[] {
          new Passage(
              o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList())
        };
      }
    }

    return new Passage[] {};
  }

  private static boolean adjacentOrOverlapping(Passage a, Passage b) {
    if (a.from >= b.from) {
      return a.from <= b.to - 1;
    } else {
      return a.to - 1 >= b.from;
    }
  }
}
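A minimal usage sketch of the selection loop above, wired the same way the tests in this patch call it (pickBest and PassageFormatter.format appear further below with exactly these signatures); the value and offsets are illustrative, and the fragment assumes it runs inside a method with this package on the classpath. Arrays.asList is used because pickBest sorts the marker list in place:

    // Two hits in a short value; keep at most 2 passages of up to 20 chars each.
    String value = "The quick brown fox jumps over the lazy dog";
    List<OffsetRange> hits = Arrays.asList(new OffsetRange(4, 9), new OffsetRange(35, 39));
    List<OffsetRange> permitted = Arrays.asList(new OffsetRange(0, value.length()));

    PassageSelector selector = new PassageSelector();
    List<Passage> passages = selector.pickBest(value, hits, 20, 2, permitted);

    // Same ASCII markers the tests use: "..." for ellipsis, >...< around hits.
    PassageFormatter formatter = new PassageFormatter("...", ">", "<");
    List<String> snippets = formatter.format(value, passages, permitted);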
@@ -0,0 +1,28 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * This package contains several components useful to build a highlighter
 * on top of the {@link org.apache.lucene.search.Matches} API.
 *
 * {@link org.apache.lucene.search.matchhighlight.MatchRegionRetriever} can be
 * used to retrieve hit areas for a given {@link org.apache.lucene.search.Query}
 * and one (or more) indexed documents. These hit areas can then be passed to
 * {@link org.apache.lucene.search.matchhighlight.PassageSelector} and formatted
 * with {@link org.apache.lucene.search.matchhighlight.PassageFormatter}.
 */
package org.apache.lucene.search.matchhighlight;
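A rough end-to-end sketch of the pipeline this package documentation describes, assembled the same way the tests later in this patch do (the MatchRegionRetriever constructor, computeOffsetRetrievalStrategies, and highlightDocuments are used below with the signatures shown in the test code); the reader, query, and analyzer are assumed to exist:

    IndexSearcher searcher = new IndexSearcher(reader);
    Query rewritten = searcher.rewrite(query);
    TopDocs topDocs = searcher.search(rewritten, 10);

    MatchRegionRetriever retriever =
        new MatchRegionRetriever(searcher, rewritten, analyzer,
            MatchRegionRetriever.computeOffsetRetrievalStrategies(reader, analyzer));

    // The consumer receives, per matching document, a map from field name to its
    // hit ranges; these can then go through PassageSelector and PassageFormatter.
    retriever.highlightDocuments(topDocs,
        (docId, leafReader, leafDocId, fieldHighlights) -> {
          // build snippets from fieldHighlights here
        });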
@@ -0,0 +1,81 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * A simple ASCII match range highlighter for tests.
 */
final class AsciiMatchRangeHighlighter {
  private final Analyzer analyzer;
  private final PassageFormatter passageFormatter;
  private final PassageSelector selector;

  private int maxPassageWindow = 160;
  private int maxPassages = 10;

  public AsciiMatchRangeHighlighter(Analyzer analyzer) {
    this.passageFormatter = new PassageFormatter("...", ">", "<");
    this.selector = new PassageSelector();
    this.analyzer = analyzer;
  }

  public Map<String, List<String>> apply(Document document, Map<String, List<OffsetRange>> fieldHighlights) {
    ArrayList<OffsetRange> valueRanges = new ArrayList<>();
    Map<String, List<String>> fieldSnippets = new LinkedHashMap<>();

    fieldHighlights.forEach(
        (field, matchRanges) -> {
          int offsetGap = analyzer.getOffsetGap(field);

          String[] values = document.getValues(field);
          String value;
          if (values.length == 1) {
            value = values[0];
          } else {
            // This can be inefficient if the offset gap is large, but recomputing
            // offsets in a smart way doesn't make sense for tests.
            String fieldGapPadding = " ".repeat(offsetGap);
            value = String.join(fieldGapPadding, values);
          }

          // Create permitted range windows for passages so that they don't cross
          // multi-value boundaries.
          valueRanges.clear();
          int offset = 0;
          for (CharSequence v : values) {
            valueRanges.add(new OffsetRange(offset, offset + v.length()));
            offset += v.length();
            offset += offsetGap;
          }

          List<Passage> passages =
              selector.pickBest(value, matchRanges, maxPassageWindow, maxPassages, valueRanges);

          fieldSnippets.put(field, passageFormatter.format(value, passages, valueRanges));
        });

    return fieldSnippets;
  }
}
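For orientation, a hypothetical call against this helper; the field name and offset range below are made up, and in the tests the field-to-ranges map comes from MatchRegionRetriever rather than being built by hand:

    AsciiMatchRangeHighlighter highlighter = new AsciiMatchRangeHighlighter(analyzer);
    Map<String, List<String>> snippets =
        highlighter.apply(document, Map.of("title", List.of(new OffsetRange(0, 5))));
    // One list of formatted snippets per field, with hits wrapped in >...<.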
@@ -0,0 +1,39 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import org.apache.lucene.analysis.Analyzer;

import java.io.Reader;

/** An {@link Analyzer} that throws a runtime exception when used for anything. */
final class MissingAnalyzer extends Analyzer {
  @Override
  protected Reader initReader(String fieldName, Reader reader) {
    throw new RuntimeException("Field must have an explicit Analyzer: " + fieldName);
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    throw new RuntimeException("Field must have an explicit Analyzer: " + fieldName);
  }

  @Override
  public int getOffsetGap(String fieldName) {
    return 0;
  }
}
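The tests below use this class as the fallback of a PerFieldAnalyzerWrapper, so that any field without an explicitly registered analyzer fails fast. A condensed sketch of that pattern (WhitespaceAnalyzer stands in for any real per-field analyzer here):

    Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
    fieldAnalyzers.put("field_text", new WhitespaceAnalyzer());
    // Fields missing from the map trip MissingAnalyzer's runtime exception.
    Analyzer analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);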
@@ -0,0 +1,767 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.function.BiFunction;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.emptyArray;
import static org.hamcrest.Matchers.not;

public class TestMatchRegionRetriever extends LuceneTestCase {
  private static final String FLD_ID = "field_id";

  private static final String FLD_TEXT_POS_OFFS1 = "field_text_offs1";
  private static final String FLD_TEXT_POS_OFFS2 = "field_text_offs2";

  private static final String FLD_TEXT_POS_OFFS = "field_text_offs";
  private static final String FLD_TEXT_POS = "field_text";

  private static final String FLD_TEXT_SYNONYMS_POS_OFFS = "field_text_syns_offs";
  private static final String FLD_TEXT_SYNONYMS_POS = "field_text_syns";

  private static final String FLD_TEXT_NOPOS = "field_text_nopos";

  private static final String FLD_NON_EXISTING = "field_missing";

  private FieldType TYPE_STORED_WITH_OFFSETS;
  private FieldType TYPE_STORED_NO_POSITIONS;

  private Analyzer analyzer;

  @Before
  public void setup() {
    TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED);
    TYPE_STORED_WITH_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    TYPE_STORED_WITH_OFFSETS.freeze();

    TYPE_STORED_NO_POSITIONS = new FieldType(TextField.TYPE_STORED);
    TYPE_STORED_NO_POSITIONS.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    TYPE_STORED_NO_POSITIONS.freeze();

    Analyzer whitespaceAnalyzer =
        new Analyzer() {
          final int offsetGap = RandomizedTest.randomIntBetween(0, 2);
          final int positionGap = RandomizedTest.randomFrom(new int[] {0, 1, 100});

          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            WhitespaceTokenizer tokenizer =
                new WhitespaceTokenizer(CharTokenizer.DEFAULT_MAX_WORD_LEN);
            return new TokenStreamComponents(tokenizer);
          }

          @Override
          public int getOffsetGap(String fieldName) {
            return offsetGap;
          }

          @Override
          public int getPositionIncrementGap(String fieldName) {
            return positionGap;
          }
        };

    Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
    fieldAnalyzers.put(FLD_TEXT_POS, whitespaceAnalyzer);
    fieldAnalyzers.put(FLD_TEXT_POS_OFFS, whitespaceAnalyzer);
    fieldAnalyzers.put(FLD_TEXT_POS_OFFS1, whitespaceAnalyzer);
    fieldAnalyzers.put(FLD_TEXT_POS_OFFS2, whitespaceAnalyzer);
    fieldAnalyzers.put(FLD_TEXT_NOPOS, whitespaceAnalyzer);

    try {
      SynonymMap.Builder b = new SynonymMap.Builder();
      b.add(new CharsRef("foo\u0000bar"), new CharsRef("syn1"), true);
      b.add(new CharsRef("baz"), new CharsRef("syn2\u0000syn3"), true);
      SynonymMap synonymMap = b.build();
      Analyzer synonymsAnalyzer =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              Tokenizer tokenizer = new WhitespaceTokenizer();
              TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
              return new TokenStreamComponents(tokenizer, tokenStream);
            }
          };
      fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer);
      fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer);
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }

    analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
  }

  BiFunction<String, String, Query> stdQueryParser =
      (query, defField) -> {
        try {
          StandardQueryParser parser = new StandardQueryParser(analyzer);
          parser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);
          return parser.parse(query, defField);
        } catch (QueryNodeException e) {
          throw new RuntimeException(e);
        }
      };

  @Test
  public void testTermQueryWithOffsets() throws IOException {
    checkTermQuery(FLD_TEXT_POS_OFFS);
  }

  @Test
  public void testTermQueryWithPositions() throws IOException {
    checkTermQuery(FLD_TEXT_POS);
  }

  private void checkTermQuery(String field) throws IOException {
    withReader(
        List.of(
            Map.of(field, values("foo bar baz")),
            Map.of(field, values("bar foo baz")),
            Map.of(field, values("bar baz foo")),
            Map.of(field, values("bar bar bar irrelevant"))),
        reader -> {
          assertThat(highlights(reader, new TermQuery(new Term(field, "foo"))),
              containsInAnyOrder(
                  fmt("0: (%s: '>foo< bar baz')", field),
                  fmt("1: (%s: 'bar >foo< baz')", field),
                  fmt("2: (%s: 'bar baz >foo<')", field)));
        });
  }

  @Test
  public void testBooleanMultifieldQueryWithOffsets() throws IOException {
    checkBooleanMultifieldQuery(FLD_TEXT_POS_OFFS);
  }

  @Test
  public void testBooleanMultifieldQueryWithPositions() throws IOException {
    checkBooleanMultifieldQuery(FLD_TEXT_POS);
  }

  private void checkBooleanMultifieldQuery(String field) throws IOException {
    Query query =
        new BooleanQuery.Builder()
            .add(new PhraseQuery(1, field, "foo", "baz"), BooleanClause.Occur.SHOULD)
            .add(new TermQuery(new Term(FLD_NON_EXISTING, "abc")), BooleanClause.Occur.SHOULD)
            .add(new TermQuery(new Term(field, "xyz")), BooleanClause.Occur.MUST_NOT)
            .build();

    withReader(
        List.of(
            Map.of(field, values("foo bar baz abc")),
            Map.of(field, values("bar foo baz def")),
            Map.of(field, values("bar baz foo xyz"))),
        reader -> {
          assertThat(highlights(reader, query),
              containsInAnyOrder(
                  fmt("0: (%s: '>foo bar baz< abc')", field),
                  fmt("1: (%s: 'bar >foo baz< def')", field)));
        });
  }

  @Test
  public void testVariousQueryTypesWithOffsets() throws IOException {
    checkVariousQueryTypes(FLD_TEXT_POS_OFFS);
  }

  @Test
  public void testVariousQueryTypesWithPositions() throws IOException {
    checkVariousQueryTypes(FLD_TEXT_POS);
  }

  private void checkVariousQueryTypes(String field) throws IOException {
    withReader(
        List.of(
            Map.of(field, values("foo bar baz abc")),
            Map.of(field, values("bar foo baz def")),
            Map.of(field, values("bar baz foo xyz"))),
        reader -> {
          assertThat(highlights(reader, stdQueryParser.apply("foo baz", field)),
              containsInAnyOrder(
                  fmt("0: (%s: '>foo< bar >baz< abc')", field),
                  fmt("1: (%s: 'bar >foo< >baz< def')", field),
                  fmt("2: (%s: 'bar >baz< >foo< xyz')", field)));

          assertThat(highlights(reader, stdQueryParser.apply("foo OR xyz", field)),
              containsInAnyOrder(
                  fmt("0: (%s: '>foo< bar baz abc')", field),
                  fmt("1: (%s: 'bar >foo< baz def')", field),
                  fmt("2: (%s: 'bar baz >foo< >xyz<')", field)));

          assertThat(highlights(reader, stdQueryParser.apply("bas~2", field)),
              containsInAnyOrder(
                  fmt("0: (%s: 'foo >bar< >baz< >abc<')", field),
                  fmt("1: (%s: '>bar< foo >baz< def')", field),
                  fmt("2: (%s: '>bar< >baz< foo xyz')", field)));

          assertThat(highlights(reader, stdQueryParser.apply("\"foo bar\"", field)),
              containsInAnyOrder(fmt("0: (%s: '>foo bar< baz abc')", field)));

          assertThat(highlights(reader, stdQueryParser.apply("\"foo bar\"~3", field)),
              containsInAnyOrder(
                  fmt("0: (%s: '>foo bar< baz abc')", field),
                  fmt("1: (%s: '>bar foo< baz def')", field),
                  fmt("2: (%s: '>bar baz foo< xyz')", field)));

          assertThat(highlights(reader, stdQueryParser.apply("ba*", field)),
              containsInAnyOrder(
                  fmt("0: (%s: 'foo >bar< >baz< abc')", field),
                  fmt("1: (%s: '>bar< foo >baz< def')", field),
                  fmt("2: (%s: '>bar< >baz< foo xyz')", field)));

          assertThat(highlights(reader, stdQueryParser.apply("[bar TO bas]", field)),
              containsInAnyOrder(
                  fmt("0: (%s: 'foo >bar< baz abc')", field),
                  fmt("1: (%s: '>bar< foo baz def')", field),
                  fmt("2: (%s: '>bar< baz foo xyz')", field)));

          // Note how document '2' has 'bar' that isn't highlighted (because this
          // document is excluded in the first clause).
          assertThat(
              highlights(reader, stdQueryParser.apply("([bar TO baz] -xyz) OR baz", field)),
              containsInAnyOrder(
                  fmt("0: (%s: 'foo >bar< >>baz<< abc')", field),
                  fmt("1: (%s: '>bar< foo >>baz<< def')", field),
                  fmt("2: (%s: 'bar >baz< foo xyz')", field)));

          assertThat(highlights(reader, new MatchAllDocsQuery()),
              Matchers.hasSize(0));
        });

    withReader(
        List.of(
            Map.of(field, values("foo baz foo")),
            Map.of(field, values("bas baz foo")),
            Map.of(field, values("bar baz foo xyz"))),
        reader -> {
          assertThat(
              highlights(reader, stdQueryParser.apply("[bar TO baz] -bar", field)),
              containsInAnyOrder(
                  fmt("0: (%s: 'foo >baz< foo')", field), fmt("1: (%s: '>bas< >baz< foo')", field)));
        });
  }

  @Test
  public void testIntervalQueries() throws IOException {
    String field = FLD_TEXT_POS_OFFS;

    withReader(
        List.of(
            Map.of(field, values("foo baz foo")),
            Map.of(field, values("bas baz foo")),
            Map.of(field, values("bar baz foo xyz"))),
        reader -> {
          assertThat(
              highlights(reader, new IntervalQuery(field,
                  Intervals.unordered(
                      Intervals.term("foo"),
                      Intervals.term("bas"),
                      Intervals.term("baz")))),
              containsInAnyOrder(
                  fmt("1: (%s: '>bas baz foo<')", field)));

          assertThat(
              highlights(reader, new IntervalQuery(field,
                  Intervals.maxgaps(1,
                      Intervals.unordered(
                          Intervals.term("foo"),
                          Intervals.term("bar"))))),
              containsInAnyOrder(
                  fmt("2: (%s: '>bar baz foo< xyz')", field)));

          assertThat(
              highlights(reader, new IntervalQuery(field,
                  Intervals.containing(
                      Intervals.unordered(
                          Intervals.term("foo"),
                          Intervals.term("bar")),
                      Intervals.term("foo")))),
              containsInAnyOrder(
                  fmt("2: (%s: '>bar baz foo< xyz')", field)));

          assertThat(
              highlights(reader, new IntervalQuery(field,
                  Intervals.containedBy(
                      Intervals.term("foo"),
                      Intervals.unordered(
                          Intervals.term("foo"),
                          Intervals.term("bar"))))),
              containsInAnyOrder(
                  fmt("2: (%s: '>bar baz foo< xyz')", field)));

          assertThat(
              highlights(reader, new IntervalQuery(field,
                  Intervals.overlapping(
                      Intervals.unordered(
                          Intervals.term("foo"),
                          Intervals.term("bar")),
                      Intervals.term("foo")))),
              containsInAnyOrder(
                  fmt("2: (%s: '>bar baz foo< xyz')", field)));
        });
  }

  @Test
  public void testMultivaluedFieldsWithOffsets() throws IOException {
    checkMultivaluedFields(FLD_TEXT_POS_OFFS);
  }

  @Test
  public void testMultivaluedFieldsWithPositions() throws IOException {
    checkMultivaluedFields(FLD_TEXT_POS);
  }

  public void checkMultivaluedFields(String field) throws IOException {
    withReader(
        List.of(
            Map.of(field, values("foo bar", "baz abc", "bad baz")),
            Map.of(field, values("bar foo", "baz def")),
            Map.of(field, values("bar baz", "foo xyz"))),
        reader -> {
          assertThat(highlights(reader, stdQueryParser.apply("baz", field)),
              containsInAnyOrder(
                  fmt("0: (%s: '>baz< abc | bad >baz<')", field),
                  fmt("1: (%s: '>baz< def')", field),
                  fmt("2: (%s: 'bar >baz<')", field)));
        });
  }

  @Test
  public void testMultiFieldHighlights() throws IOException {
    for (String[] fields :
        new String[][] {
          {FLD_TEXT_POS_OFFS1, FLD_TEXT_POS_OFFS2},
          {FLD_TEXT_POS, FLD_TEXT_POS_OFFS2},
          {FLD_TEXT_POS_OFFS1, FLD_TEXT_POS}
        }) {
      String field1 = fields[0];
      String field2 = fields[1];
      withReader(
          List.of(
              Map.of(
                  field1, values("foo bar", "baz abc"),
                  field2, values("foo baz", "loo bar"))),
          reader -> {
            String ordered =
                Stream.of(fmt("(%s: '>baz< abc')", field1), fmt("(%s: 'loo >bar<')", field2))
                    .sorted()
                    .collect(Collectors.joining(""));

            assertThat(
                highlights(
                    reader,
                    stdQueryParser.apply(field1 + ":baz" + " OR " + field2 + ":bar", field1)),
                containsInAnyOrder(fmt("0: %s", ordered)));
          });
    }
  }

  /**
   * Rewritten Boolean queries may omit matches from {@link
   * org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses. Check that this isn't the case.
   */
  @Test
  public void testNoRewrite() throws IOException {
    String field1 = FLD_TEXT_POS_OFFS1;
    String field2 = FLD_TEXT_POS_OFFS2;
    withReader(
        List.of(
            Map.of(
                field1, values("0100"),
                field2, values("loo bar")),
            Map.of(
                field1, values("0200"),
                field2, values("foo bar"))),
        reader -> {
          String expected = fmt("0: (%s: '>0100<')(%s: 'loo >bar<')", field1, field2);
          assertThat(
              highlights(
                  reader,
                  stdQueryParser.apply(fmt("+%s:01* OR %s:bar", field1, field2), field1)),
              containsInAnyOrder(expected));

          assertThat(
              highlights(
                  reader,
                  stdQueryParser.apply(fmt("+%s:01* AND %s:bar", field1, field2), field1)),
              containsInAnyOrder(expected));
        });
  }

  @Test
  public void testNestedQueryHitsWithOffsets() throws IOException {
    checkNestedQueryHits(FLD_TEXT_POS_OFFS);
  }

  @Test
  public void testNestedQueryHitsWithPositions() throws IOException {
    checkNestedQueryHits(FLD_TEXT_POS);
  }

  public void checkNestedQueryHits(String field) throws IOException {
    withReader(
        List.of(Map.of(field, values("foo bar baz abc"))),
        reader -> {
          assertThat(
              highlights(
                  reader,
                  new BooleanQuery.Builder()
                      .add(new PhraseQuery(1, field, "foo", "baz"), BooleanClause.Occur.SHOULD)
                      .add(new TermQuery(new Term(field, "bar")), BooleanClause.Occur.SHOULD)
                      .build()),
              containsInAnyOrder(fmt("0: (%s: '>foo >bar< baz< abc')", field)));

          assertThat(
              highlights(
                  reader,
                  new BooleanQuery.Builder()
                      .add(new PhraseQuery(1, field, "foo", "baz"), BooleanClause.Occur.SHOULD)
                      .add(new TermQuery(new Term(field, "bar")), BooleanClause.Occur.SHOULD)
                      .add(new TermQuery(new Term(field, "baz")), BooleanClause.Occur.SHOULD)
                      .build()),
              containsInAnyOrder(fmt("0: (%s: '>foo >bar< >baz<< abc')", field)));
        });
  }

  @Test
  public void testGraphQueryWithOffsets() throws Exception {
    checkGraphQuery(FLD_TEXT_SYNONYMS_POS_OFFS);
  }

  @Test
  public void testGraphQueryWithPositions() throws Exception {
    checkGraphQuery(FLD_TEXT_SYNONYMS_POS);
  }

  private void checkGraphQuery(String field) throws IOException {
    withReader(
        List.of(
            Map.of(field, values("foo bar baz")),
            Map.of(field, values("bar foo baz")),
            Map.of(field, values("bar baz foo")),
            Map.of(field, values("bar bar bar irrelevant"))),
        reader -> {
          assertThat(highlights(reader, new TermQuery(new Term(field, "syn1"))),
              containsInAnyOrder(fmt("0: (%s: '>foo bar< baz')", field)));

          // [syn2 syn3] = baz,
          // so all three queries below highlight baz.
          assertThat(highlights(reader, new TermQuery(new Term(field, "syn3"))),
              containsInAnyOrder(
                  fmt("0: (%s: 'foo bar >baz<')", field),
                  fmt("1: (%s: 'bar foo >baz<')", field),
                  fmt("2: (%s: 'bar >baz< foo')", field)));
          assertThat(
              highlights(reader, stdQueryParser.apply(field + ":\"syn2 syn3\"", field)),
              containsInAnyOrder(
                  fmt("0: (%s: 'foo bar >baz<')", field),
                  fmt("1: (%s: 'bar foo >baz<')", field),
                  fmt("2: (%s: 'bar >baz< foo')", field)));
          assertThat(
              highlights(reader, stdQueryParser.apply(field + ":\"foo syn2 syn3\"", field)),
              containsInAnyOrder(fmt("1: (%s: 'bar >foo baz<')", field)));
        });
  }

  @Test
  public void testSpanQueryWithOffsets() throws Exception {
    checkSpanQueries(FLD_TEXT_POS_OFFS);
  }

  @Test
  public void testSpanQueryWithPositions() throws Exception {
    checkSpanQueries(FLD_TEXT_POS);
  }

  private void checkSpanQueries(String field) throws IOException {
    withReader(
        List.of(
            Map.of(field, values("foo bar baz")),
            Map.of(field, values("bar foo baz")),
            Map.of(field, values("bar baz foo")),
            Map.of(field, values("bar bar bar irrelevant"))),
        reader -> {
          assertThat(
              highlights(
                  reader,
                  SpanNearQuery.newOrderedNearQuery(field)
                      .addClause(new SpanTermQuery(new Term(field, "bar")))
                      .addClause(new SpanTermQuery(new Term(field, "foo")))
                      .build()),
              containsInAnyOrder(fmt("1: (%s: '>bar foo< baz')", field)));

          assertThat(
              highlights(
                  reader,
                  SpanNearQuery.newOrderedNearQuery(field)
                      .addClause(new SpanTermQuery(new Term(field, "bar")))
                      .addGap(1)
                      .addClause(new SpanTermQuery(new Term(field, "foo")))
                      .build()),
              containsInAnyOrder(fmt("2: (%s: '>bar baz foo<')", field)));

          assertThat(
              highlights(
                  reader,
                  SpanNearQuery.newUnorderedNearQuery(field)
                      .addClause(new SpanTermQuery(new Term(field, "foo")))
                      .addClause(new SpanTermQuery(new Term(field, "bar")))
                      .build()),
              containsInAnyOrder(
                  fmt("0: (%s: '>foo bar< baz')", field), fmt("1: (%s: '>bar foo< baz')", field)));

          assertThat(
              highlights(
                  reader,
                  SpanNearQuery.newUnorderedNearQuery(field)
                      .addClause(new SpanTermQuery(new Term(field, "foo")))
                      .addClause(new SpanTermQuery(new Term(field, "bar")))
                      .setSlop(1)
                      .build()),
              containsInAnyOrder(
                  fmt("0: (%s: '>foo bar< baz')", field),
                  fmt("1: (%s: '>bar foo< baz')", field),
                  fmt("2: (%s: '>bar baz foo<')", field)));
        });
  }

  /**
   * This test runs a term query against a field with no stored positions or offsets. It checks
   * the {@link OffsetsFromValues} strategy, which returns highlights over the entire indexed
   * values.
   */
  @Test
  public void testTextFieldNoPositionsOffsetFromValues() throws Exception {
    String field = FLD_TEXT_NOPOS;

    withReader(
        List.of(
            Map.of(FLD_TEXT_NOPOS, values("foo bar")),
            Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz baz"))),
        reader -> {
          OffsetsRetrievalStrategySupplier defaults =
              MatchRegionRetriever.computeOffsetRetrievalStrategies(reader, analyzer);
          OffsetsRetrievalStrategySupplier customSuppliers = (fld) -> {
            if (fld.equals(field)) {
              return new OffsetsFromValues(field, analyzer);
            } else {
              // Fall back to the default strategy for all other fields.
              return defaults.apply(fld);
            }
          };

          assertThat(
              highlights(
                  customSuppliers,
                  reader,
                  new TermQuery(new Term(field, "bar"))),
              containsInAnyOrder(
                  fmt("0: (%s: '>foo bar<')", field),
                  fmt("1: (%s: '>foo bar< | >baz baz<')", field)));
        });
  }

  /**
   * This test runs a term query against a field with no stored positions or offsets.
   * <p>
   * Such a field structure is often useful for multivalued, keyword-like fields.
   */
  @Test
  public void testTextFieldNoPositionsOffsetsFromTokens() throws Exception {
    String field = FLD_TEXT_NOPOS;

    withReader(
        List.of(
            Map.of(FLD_TEXT_NOPOS, values("foo bar"),
                FLD_TEXT_POS, values("bar bar")),
            Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz bar"))),
        reader -> {
          assertThat(
              highlights(
                  reader,
                  new TermQuery(new Term(field, "bar"))),
              containsInAnyOrder(
                  fmt("0: (%s: 'foo >bar<')", field),
                  fmt("1: (%s: 'foo >bar< | baz >bar<')", field)));
        });
  }

  private List<String> highlights(IndexReader reader, Query query) throws IOException {
    return highlights(MatchRegionRetriever.computeOffsetRetrievalStrategies(reader, analyzer),
        reader, query);
  }

  private List<String> highlights(OffsetsRetrievalStrategySupplier offsetsStrategySupplier,
                                  IndexReader reader, Query query) throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    int maxDocs = 1000;

    Query rewrittenQuery = searcher.rewrite(query);
    TopDocs topDocs = searcher.search(rewrittenQuery, maxDocs);

    ArrayList<String> highlights = new ArrayList<>();

    AsciiMatchRangeHighlighter formatter = new AsciiMatchRangeHighlighter(analyzer);

    MatchRegionRetriever.MatchOffsetsConsumer highlightCollector =
        (docId, leafReader, leafDocId, fieldHighlights) -> {
          StringBuilder sb = new StringBuilder();

          Document document = leafReader.document(leafDocId);
          formatter
              .apply(document, new TreeMap<>(fieldHighlights))
              .forEach(
                  (field, snippets) -> {
                    sb.append(
                        String.format(
                            Locale.ROOT, "(%s: '%s')", field, String.join(" | ", snippets)));
                  });

          if (sb.length() > 0) {
            sb.insert(0, document.get(FLD_ID) + ": ");
            highlights.add(sb.toString());
          }
        };

    MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, analyzer,
        offsetsStrategySupplier);
    highlighter.highlightDocuments(topDocs, highlightCollector);

    return highlights;
  }

  private String[] values(String... values) {
    assertThat(values, not(emptyArray()));
    return values;
  }

  private void withReader(
      Collection<Map<String, String[]>> docs, IOUtils.IOConsumer<DirectoryReader> block)
      throws IOException {
    IndexWriterConfig config = new IndexWriterConfig(analyzer);

    try (Directory directory = new ByteBuffersDirectory()) {
      IndexWriter iw = new IndexWriter(directory, config);

      int seq = 0;
      for (Map<String, String[]> fields : docs) {
        Document doc = new Document();
        doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES));
        for (Map.Entry<String, String[]> field : fields.entrySet()) {
          for (String value : field.getValue()) {
            doc.add(toField(field.getKey(), value));
          }
        }
        iw.addDocument(doc);
        if (RandomizedTest.randomBoolean()) {
          iw.commit();
        }
      }
      iw.flush();

      try (DirectoryReader reader = DirectoryReader.open(iw)) {
        block.accept(reader);
      }
    }
  }

  private IndexableField toField(String name, String value) {
    switch (name) {
      case FLD_TEXT_NOPOS:
        return new Field(name, value, TYPE_STORED_NO_POSITIONS);
      case FLD_TEXT_POS:
      case FLD_TEXT_SYNONYMS_POS:
        return new TextField(name, value, Field.Store.YES);
      case FLD_TEXT_POS_OFFS:
      case FLD_TEXT_POS_OFFS1:
      case FLD_TEXT_POS_OFFS2:
      case FLD_TEXT_SYNONYMS_POS_OFFS:
        return new Field(name, value, TYPE_STORED_WITH_OFFSETS);
      default:
        throw new AssertionError("Don't know how to handle this field: " + name);
    }
  }

  private static String fmt(String string, Object... args) {
    return String.format(Locale.ROOT, string, args);
  }
}
@@ -0,0 +1,284 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import static com.carrotsearch.randomizedtesting.RandomizedTest.randomAsciiLettersOfLengthBetween;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomRealisticUnicodeOfCodepointLengthBetween;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;

import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Test;

public class TestPassageSelector extends LuceneTestCase {
  @Test
  public void checkEmptyExtra() {
    checkPassages(
        "foo >>bar<< baz abc",
        "foo bar baz abc",
        300,
        100,
        new OffsetRange(4, 7),
        new OffsetRange(4, 7));

    checkPassages(
        ">foo >bar< >baz<< abc",
        "foo bar baz abc",
        300,
        100,
        new OffsetRange(0, 11),
        new OffsetRange(4, 7),
        new OffsetRange(8, 11));

    checkPassages(
        ">>foo< bar >baz<< abc",
        "foo bar baz abc",
        300,
        100,
        new OffsetRange(0, 11),
        new OffsetRange(0, 3),
        new OffsetRange(8, 11));
  }

  @Test
  public void oneMarker() {
    checkPassages(">0<123456789a", "0123456789a", 300, 1, new OffsetRange(0, 1));
    checkPassages("0123456789>a<", "0123456789a", 300, 1, new OffsetRange(10, 11));
    checkPassages(">0123456789a<", "0123456789a", 300, 1, new OffsetRange(0, 11));
  }

  @Test
  public void noHighlights() {
    checkPassages("0123456789a", "0123456789a", 300, 1);
    checkPassages("01234...", "0123456789a", 5, 1);
    checkPassages(
        "0123",
        "0123456789a",
        15,
        2,
        new OffsetRange[0],
        new OffsetRange[] {new OffsetRange(0, 4), new OffsetRange(4, 9)});
  }

  @Test
  public void oneMarkerTruncated() {
    checkPassages(">0<12...", "0123456789a", 4, 1, new OffsetRange(0, 1));
    checkPassages("...789>a<", "0123456789a", 4, 1, new OffsetRange(10, 11));
    checkPassages("...>3456<...", "0123456789a", 4, 1, new OffsetRange(3, 7));
    checkPassages("...3>45<6...", "0123456789a", 4, 1, new OffsetRange(4, 6));
  }

  @Test
  public void highlightLargerThanWindow() {
    String value = "0123456789a";
    checkPassages("0123...", value, 4, 1, new OffsetRange(0, value.length()));
  }

  @Test
  public void twoMarkers() {
    checkPassages(
        "0>12<3>45<6789a", "0123456789a", 300, 1, new OffsetRange(1, 3), new OffsetRange(4, 6));
    checkPassages(
        "0>123<>45<6789a", "0123456789a", 300, 1, new OffsetRange(1, 4), new OffsetRange(4, 6));
  }

  @Test
  public void noMarkers() {
    checkPassages("0123456789a", "0123456789a", 300, 1);
    checkPassages("0123...", "0123456789a", 4, 1);
  }

  @Test
  public void markersOutsideValue() {
    checkPassages("0123456789a", "0123456789a", 300, 1, new OffsetRange(100, 200));
  }

  @Test
  public void twoPassages() {
    checkPassages(
        "0>12<3...|...6>78<9...",
        "0123456789a",
        4,
        2,
        new OffsetRange(1, 3),
        new OffsetRange(7, 9));
  }

  @Test
  public void emptyRanges() {
    // Empty ranges cover the highlight, so it is omitted.
    // Instead, the first non-empty range is taken as the default.
    checkPassages(
        "6789...",
        "0123456789a",
        4,
        2,
        ranges(new OffsetRange(0, 1)),
        ranges(new OffsetRange(0, 0), new OffsetRange(2, 2), new OffsetRange(6, 11)));
  }

  @Test
  public void passageScoring() {
    // More highlights per passage -> a better passage.
    checkPassages(
        ">01<>23<...",
        "0123456789a",
        4,
        1,
        new OffsetRange(0, 2),
        new OffsetRange(2, 4),
        new OffsetRange(8, 10));

    checkPassages(
        "...>01<23>45<67>89<...",
        "__________0123456789a__________",
        10,
        1,
        new OffsetRange(10, 12),
        new OffsetRange(14, 16),
        new OffsetRange(18, 20));

    // ...if tied, the one with the longer overall highlight length.
    checkPassages(
        "...6>789<...", "0123456789a", 4, 1, new OffsetRange(0, 2), new OffsetRange(7, 10));

    // ...if still tied, the first one in order.
    checkPassages(">01<23...", "0123456789a", 4, 1, new OffsetRange(0, 2), new OffsetRange(8, 10));
  }

  @Test
  public void rangeWindows() {
    // Add constraint windows to split the three highlights.
    checkPassages(
        "..._______>01<2",
        "__________0123456789a__________",
        10,
        3,
        ranges(new OffsetRange(10, 12), new OffsetRange(14, 16), new OffsetRange(18, 20)),
        ranges(new OffsetRange(0, 13)));

    checkPassages(
        ">89<a_______...",
        "__________0123456789a__________",
        10,
        3,
        ranges(new OffsetRange(10, 12), new OffsetRange(14, 16), new OffsetRange(18, 20)),
        ranges(new OffsetRange(18, Integer.MAX_VALUE)));

    checkPassages(
        "...________>01<|23>45<67|>89<a_______...",
        "__________0123456789a__________",
        10,
        3,
        ranges(new OffsetRange(10, 12), new OffsetRange(14, 16), new OffsetRange(18, 20)),
        ranges(
            new OffsetRange(0, 12),
            new OffsetRange(12, 18),
            new OffsetRange(18, Integer.MAX_VALUE)));
  }

  @Test
  public void randomizedSanityCheck() {
    PassageSelector selector = new PassageSelector();
    PassageFormatter formatter = new PassageFormatter("...", ">", "<");
    ArrayList<OffsetRange> highlights = new ArrayList<>();
    ArrayList<OffsetRange> ranges = new ArrayList<>();
    for (int i = 0; i < 5000; i++) {
      String value =
          randomBoolean()
              ? randomAsciiLettersOfLengthBetween(0, 100)
              : randomRealisticUnicodeOfCodepointLengthBetween(0, 1000);

      ranges.clear();
      highlights.clear();
      for (int j = randomIntBetween(0, 10); --j >= 0; ) {
        int from = randomIntBetween(0, value.length());
        highlights.add(new OffsetRange(from, from + randomIntBetween(1, 10)));
      }

      int charWindow = randomIntBetween(1, 100);
      int maxPassages = randomIntBetween(1, 10);

      if (randomIntBetween(0, 5) == 0) {
        int increment = value.length() / 10;
        for (int c = randomIntBetween(0, 20), start = 0; --c >= 0; ) {
          int step = randomIntBetween(0, increment);
          ranges.add(new OffsetRange(start, start + step));
          start += step + randomIntBetween(0, 3);
        }
      } else {
        ranges.add(new OffsetRange(0, value.length()));
      }

      // Just make sure there are no exceptions.
      List<Passage> passages =
          selector.pickBest(value, highlights, charWindow, maxPassages, ranges);
      formatter.format(value, passages, ranges);
    }
  }

  private void checkPassages(
      String expected, String value, int charWindow, int maxPassages, OffsetRange... highlights) {
    checkPassages(
        expected,
        value,
        charWindow,
        maxPassages,
        highlights,
        ranges(new OffsetRange(0, value.length())));
  }

  private void checkPassages(
      String expected,
      String value,
      int charWindow,
      int maxPassages,
      OffsetRange[] highlights,
      OffsetRange[] ranges) {
    String result = getPassages(value, charWindow, maxPassages, highlights, ranges);
    if (!Objects.equals(result, expected)) {
      System.out.println("Value: " + value);
      System.out.println("Result: " + result);
      System.out.println("Expect: " + expected);
    }
    assertThat(result, Matchers.equalTo(expected));
  }

  protected String getPassages(
      String value,
      int charWindow,
      int maxPassages,
      OffsetRange[] highlights,
      OffsetRange[] ranges) {
    PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<");
    PassageSelector selector = new PassageSelector();
    List<OffsetRange> hlist = Arrays.asList(highlights);
    List<OffsetRange> rangeList = Arrays.asList(ranges);
    List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList);
    return String.join("|", passageFormatter.format(value, passages, rangeList));
  }

  protected OffsetRange[] ranges(OffsetRange... offsets) {
    return offsets;
  }
}