mirror of https://github.com/apache/lucene.git

LUCENE-10220: Add a utility method to get IntervalsSource from analyzed text (or token stream) (#427)

This commit is contained in:
parent bec8eaef70
commit 32d7f52446
@@ -37,7 +37,9 @@ API Changes
 
 New Features
 ---------------------
-(No changes)
+
+* LUCENE-10220: Add a utility method to get IntervalsSource from analyzed text (or token stream).
+  (Uwe Schindler, Dawid Weiss, Alan Woodward)
 
 Improvements
 ---------------------
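For context, a minimal usage sketch of the new factory method (not part of the diff; the StandardAnalyzer and the "body" field name are illustrative assumptions):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.Query;

class AnalyzedTextUsageSketch {
  // Builds an interval query from free text analyzed as the (hypothetical) field "body".
  static Query intervalQueryFor(String text) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(); // illustrative analyzer choice
    // maxGaps = 0 and ordered = true: the analyzed tokens must appear
    // adjacent and in order, i.e. an analyzed phrase.
    IntervalsSource source = Intervals.analyzedText(text, analyzer, "body", 0, true);
    return new IntervalQuery("body", source);
  }
}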
@@ -32,10 +32,13 @@ import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.document.Field;
@@ -264,6 +267,57 @@ public class TestMatchHighlighter extends LuceneTestCase {
         });
   }
 
+  @Test
+  public void testAnalyzedTextIntervals() throws IOException {
+    SynonymMap synonymMap =
+        buildSynonymMap(
+            new String[][] {
+              {"moon\u0000shine", "firewater"},
+              {"firewater", "moon\u0000shine"},
+            });
+
+    Analyzer analyzer =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new StandardTokenizer();
+            TokenStream ts = tokenizer;
+            ts = new LowerCaseFilter(ts);
+            ts = new SynonymGraphFilter(ts, synonymMap, true);
+            ts = new PorterStemFilter(ts);
+            return new TokenStreamComponents(tokenizer, ts);
+          }
+        };
+
+    new IndexBuilder(this::toField)
+        .doc(FLD_TEXT1, "Where the moon shine falls, firewater flows.")
+        .build(
+            analyzer,
+            reader -> {
+              IndexSearcher searcher = new IndexSearcher(reader);
+              Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
+
+              MatchHighlighter highlighter =
+                  new MatchHighlighter(searcher, analyzer)
+                      .appendFieldHighlighter(
+                          FieldValueHighlighters.highlighted(
+                              80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
+                      .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+              {
+                // [moon shine, firewater] are synonyms, tokens are lowercased. Porter stemming on.
+                Query query =
+                    new IntervalQuery(
+                        FLD_TEXT1,
+                        Intervals.analyzedText("Firewater Fall", analyzer, FLD_TEXT1, 0, true));
+
+                assertHighlights(
+                    toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
+                    "0. text1: Where the >moon shine falls<, firewater flows.");
+              }
+            });
+  }
+
   @Test
   public void testCustomFieldHighlightHandling() throws IOException {
     // Match highlighter is a showcase of individual components in this package, suitable
@@ -0,0 +1,325 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Code adopted from ASL-licensed Elasticsearch.
 * https://github.com/elastic/elasticsearch/blob/7.10/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java
 *
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.queries.intervals;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;

/**
 * Constructs an {@link IntervalsSource} based on analyzed text.
 *
 * <p>Code adopted from ASL-licensed <a
 * href="https://github.com/elastic/elasticsearch">Elasticsearch</a>.
 *
 * @see
 *     "https://github.com/elastic/elasticsearch/blob/7.10/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java"
 */
final class IntervalBuilder {
  static IntervalsSource analyzeText(CachingTokenFilter stream, int maxGaps, boolean ordered)
      throws IOException {
    assert stream != null;

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class);

    if (termAtt == null) {
      return NO_INTERVALS;
    }

    // phase 1: read through the stream and assess the situation:
    // counting the number of tokens/positions and marking if we have any synonyms.

    int numTokens = 0;
    boolean hasSynonyms = false;
    boolean isGraph = false;

    stream.reset();
    while (stream.incrementToken()) {
      numTokens++;
      int positionIncrement = posIncAtt.getPositionIncrement();
      if (positionIncrement == 0) {
        hasSynonyms = true;
      }
      int positionLength = posLenAtt.getPositionLength();
      if (positionLength > 1) {
        isGraph = true;
      }
    }

    // phase 2: based on token count, presence of synonyms, and options
    // formulate a single term, boolean, or phrase.

    if (numTokens == 0) {
      return NO_INTERVALS;
    } else if (numTokens == 1) {
      // single term
      return analyzeTerm(stream);
    } else if (isGraph) {
      // graph
      return combineSources(analyzeGraph(stream), maxGaps, ordered);
    } else {
      // phrase
      if (hasSynonyms) {
        // phrase with single-term synonyms
        return analyzeSynonyms(stream, maxGaps, ordered);
      } else {
        // simple phrase
        return combineSources(analyzeTerms(stream), maxGaps, ordered);
      }
    }
  }

  private static IntervalsSource analyzeTerm(TokenStream ts) throws IOException {
    TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    ts.reset();
    ts.incrementToken();
    return Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef()));
  }

  private static IntervalsSource combineSources(
      List<IntervalsSource> sources, int maxGaps, boolean ordered) {
    if (sources.size() == 0) {
      return NO_INTERVALS;
    }
    if (sources.size() == 1) {
      return sources.get(0);
    }
    IntervalsSource[] sourcesArray = sources.toArray(new IntervalsSource[0]);
    if (maxGaps == 0 && ordered) {
      return Intervals.phrase(sourcesArray);
    }
    IntervalsSource inner =
        ordered ? Intervals.ordered(sourcesArray) : Intervals.unordered(sourcesArray);
    if (maxGaps == -1) {
      return inner;
    }
    return Intervals.maxgaps(maxGaps, inner);
  }

  private static List<IntervalsSource> analyzeTerms(TokenStream ts) throws IOException {
    List<IntervalsSource> terms = new ArrayList<>();
    TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      BytesRef term = bytesAtt.getBytesRef();
      int precedingSpaces = posAtt.getPositionIncrement() - 1;
      terms.add(extend(Intervals.term(BytesRef.deepCopyOf(term)), precedingSpaces));
    }
    ts.end();
    return terms;
  }

  private static IntervalsSource extend(IntervalsSource source, int precedingSpaces) {
    if (precedingSpaces == 0) {
      return source;
    }
    return Intervals.extend(source, precedingSpaces, 0);
  }

  private static IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean ordered)
      throws IOException {
    List<IntervalsSource> terms = new ArrayList<>();
    List<IntervalsSource> synonyms = new ArrayList<>();
    TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    int spaces = 0;
    while (ts.incrementToken()) {
      int posInc = posAtt.getPositionIncrement();
      if (posInc > 0) {
        if (synonyms.size() == 1) {
          terms.add(extend(synonyms.get(0), spaces));
        } else if (synonyms.size() > 1) {
          terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
        }
        synonyms.clear();
        spaces = posInc - 1;
      }
      synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef())));
    }
    if (synonyms.size() == 1) {
      terms.add(extend(synonyms.get(0), spaces));
    } else {
      terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
    }
    return combineSources(terms, maxGaps, ordered);
  }

  private static List<IntervalsSource> analyzeGraph(TokenStream source) throws IOException {
    source.reset();
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);

    List<IntervalsSource> clauses = new ArrayList<>();
    int[] articulationPoints = graph.articulationPoints();
    int lastState = 0;
    int maxClauseCount = BooleanQuery.getMaxClauseCount();
    for (int i = 0; i <= articulationPoints.length; i++) {
      int start = lastState;
      int end = -1;
      if (i < articulationPoints.length) {
        end = articulationPoints[i];
      }
      lastState = end;
      if (graph.hasSidePath(start)) {
        List<IntervalsSource> paths = new ArrayList<>();
        Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
        while (it.hasNext()) {
          TokenStream ts = it.next();
          IntervalsSource phrase = combineSources(analyzeTerms(ts), 0, true);
          if (paths.size() >= maxClauseCount) {
            throw new BooleanQuery.TooManyClauses();
          }
          paths.add(phrase);
        }
        if (paths.size() > 0) {
          clauses.add(Intervals.or(paths.toArray(new IntervalsSource[0])));
        }
      } else {
        Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
        TokenStream ts = it.next();
        clauses.addAll(analyzeTerms(ts));
        assert it.hasNext() == false;
      }
    }
    return clauses;
  }

  static final IntervalsSource NO_INTERVALS =
      new IntervalsSource() {
        @Override
        public IntervalIterator intervals(String field, LeafReaderContext ctx) {
          return new IntervalIterator() {
            @Override
            public int start() {
              return NO_MORE_INTERVALS;
            }

            @Override
            public int end() {
              return NO_MORE_INTERVALS;
            }

            @Override
            public int gaps() {
              throw new UnsupportedOperationException();
            }

            @Override
            public int nextInterval() {
              return NO_MORE_INTERVALS;
            }

            @Override
            public float matchCost() {
              return 0;
            }

            @Override
            public int docID() {
              return NO_MORE_DOCS;
            }

            @Override
            public int nextDoc() {
              return NO_MORE_DOCS;
            }

            @Override
            public int advance(int target) {
              return NO_MORE_DOCS;
            }

            @Override
            public long cost() {
              return 0;
            }
          };
        }

        @Override
        public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) {
          return null;
        }

        @Override
        public void visit(String field, QueryVisitor visitor) {}

        @Override
        public int minExtent() {
          return 0;
        }

        @Override
        public Collection<IntervalsSource> pullUpDisjunctions() {
          return Collections.emptyList();
        }

        @Override
        public int hashCode() {
          return 0;
        }

        @Override
        public boolean equals(Object other) {
          return other == this;
        }

        @Override
        public String toString() {
          return "no_match";
        }
      };
}
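As background for the phase-1 flags in analyzeText() above: by token-stream convention, a position increment of 0 marks a token stacked on the previous position (a synonym), and a position length greater than 1 marks a token spanning several positions (a graph). A small sketch of streams that would take each branch, using the Token and CannedTokenStream test helpers that also appear in the new test file below (the terms are illustrative):

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;

class PhaseOneFlagsSketch {
  // Token(text, posInc, startOffset, endOffset[, posLength]).

  // posInc == 0 stacks "whisky" on the same position as "scotch", so
  // analyzeText() sets hasSynonyms and routes to analyzeSynonyms().
  static CannedTokenStream synonyms() {
    return new CannedTokenStream(new Token("scotch", 1, 0, 6), new Token("whisky", 0, 0, 6));
  }

  // posLength == 2 lets "new_york" span two positions alongside "new york",
  // so analyzeText() sets isGraph and routes to analyzeGraph().
  static CannedTokenStream graph() {
    return new CannedTokenStream(
        new Token("new", 1, 0, 3), new Token("new_york", 0, 0, 8, 2), new Token("york", 1, 4, 8));
  }
}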
@@ -17,9 +17,13 @@
 package org.apache.lucene.queries.intervals;
 
+import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
 import java.util.function.Predicate;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.WildcardQuery;
@@ -30,17 +34,17 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
  * Constructor functions for {@link IntervalsSource} types
  *
  * <p>These sources implement minimum-interval algorithms taken from the paper <a
- * href="http://vigna.di.unimi.it/ftp/papers/EfficientAlgorithmsMinimalIntervalSemantics.pdf">
- * Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics</a>
+ * href="https://vigna.di.unimi.it/ftp/papers/EfficientLazy.pdf">Efficient Optimally Lazy Algorithms
+ * for Minimal-Interval Semantics</a>
  *
- * <p>By default, sources that are sensitive to internal gaps (e.g. PHRASE and MAXGAPS) will rewrite
- * their sub-sources so that disjunctions of different lengths are pulled up to the top of the
- * interval tree. For example, PHRASE(or(PHRASE("a", "b", "c"), "b"), "c") will automatically
- * rewrite itself to OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c")) to ensure that documents
- * containing "b c" are matched. This can lead to less efficient queries, as more terms need to be
- * loaded (for example, the "c" iterator above is loaded twice), so if you care more about speed
- * than about accuracy you can use the {@link #or(boolean, IntervalsSource...)} factory method to
- * prevent rewriting.
+ * <p>By default, sources that are sensitive to internal gaps (e.g. {@code PHRASE} and {@code
+ * MAXGAPS}) will rewrite their sub-sources so that disjunctions of different lengths are pulled up
+ * to the top of the interval tree. For example, {@code PHRASE(or(PHRASE("a", "b", "c"), "b"), "c")}
+ * will automatically rewrite itself to {@code OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c"))} to
+ * ensure that documents containing {@code "b c"} are matched. This can lead to less efficient
+ * queries, as more terms need to be loaded (for example, the {@code "c"} iterator above is loaded
+ * twice), so if you care more about speed than about accuracy you can use the {@link #or(boolean,
+ * IntervalsSource...)} factory method to prevent rewriting.
  */
 public final class Intervals {
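The rewrite described in the javadoc above, spelled out as a sketch (factory methods as declared in this class; the terms are the javadoc's own example):

import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;

class RewriteSketch {
  // PHRASE(or(PHRASE("a", "b", "c"), "b"), "c"): the gap-sensitive phrase
  // pulls the disjunction up, behaving like
  // OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c")).
  static IntervalsSource rewritten() {
    return Intervals.phrase(
        Intervals.or(Intervals.phrase("a", "b", "c"), Intervals.term("b")),
        Intervals.term("c"));
  }

  // The or(boolean, ...) overload disables the pull-up: faster, but documents
  // containing only "b c" may be missed.
  static IntervalsSource unrewritten() {
    return Intervals.phrase(
        Intervals.or(false, Intervals.phrase("a", "b", "c"), Intervals.term("b")),
        Intervals.term("c"));
  }
}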
@@ -429,4 +433,50 @@ public final class Intervals {
         source,
         Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE));
   }
+
+  /**
+   * Returns intervals that correspond to tokens from a {@link TokenStream} returned for {@code
+   * text} by applying the provided {@link Analyzer} as if {@code text} was the content of the given
+   * {@code field}. The intervals can be ordered or unordered and can have optional gaps inside.
+   *
+   * @param text The text to analyze.
+   * @param analyzer The {@link Analyzer} to use to acquire a {@link TokenStream} which is then
+   *     converted into intervals.
+   * @param field The field {@code text} should be parsed as.
+   * @param maxGaps Maximum number of allowed gaps between sub-intervals resulting from tokens.
+   * @param ordered Whether sub-intervals should enforce token ordering or not.
+   * @return Returns an {@link IntervalsSource} that matches tokens acquired from analysis of {@code
+   *     text}. Possibly an empty interval source, never {@code null}.
+   * @throws IOException If an I/O exception occurs.
+   */
+  public static IntervalsSource analyzedText(
+      String text, Analyzer analyzer, String field, int maxGaps, boolean ordered)
+      throws IOException {
+    try (TokenStream ts = analyzer.tokenStream(field, text)) {
+      return analyzedText(ts, maxGaps, ordered);
+    }
+  }
+
+  /**
+   * Returns intervals that correspond to tokens from the provided {@link TokenStream}. This is a
+   * low-level counterpart to {@link #analyzedText(String, Analyzer, String, int, boolean)}. The
+   * intervals can be ordered or unordered and can have optional gaps inside.
+   *
+   * @param tokenStream The token stream to produce intervals for. The token stream may be fully or
+   *     partially consumed after returning from this method.
+   * @param maxGaps Maximum number of allowed gaps between sub-intervals resulting from tokens.
+   * @param ordered Whether sub-intervals should enforce token ordering or not.
+   * @return Returns an {@link IntervalsSource} that matches tokens acquired from analysis of {@code
+   *     text}. Possibly an empty interval source, never {@code null}.
+   * @throws IOException If an I/O exception occurs.
+   */
+  public static IntervalsSource analyzedText(TokenStream tokenStream, int maxGaps, boolean ordered)
+      throws IOException {
+    CachingTokenFilter stream =
+        tokenStream instanceof CachingTokenFilter
+            ? (CachingTokenFilter) tokenStream
+            : new CachingTokenFilter(tokenStream);
+
+    return IntervalBuilder.analyzeText(stream, maxGaps, ordered);
+  }
 }
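A short sketch contrasting the two overloads added above (assumptions: a StandardAnalyzer and a "title" field, neither of which appears in the diff):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;

class AnalyzedTextOverloadsSketch {
  // High-level overload: analyze the text as if it were field "title";
  // unordered, with up to 2 positions of slack between tokens.
  static IntervalsSource fromText() throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    return Intervals.analyzedText("quick brown fox", analyzer, "title", 2, false);
  }

  // Low-level overload: pass any TokenStream; it is wrapped in a
  // CachingTokenFilter internally and may be (partially) consumed.
  static IntervalsSource fromStream(Analyzer analyzer) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("title", "quick brown fox")) {
      return Intervals.analyzedText(ts, 0, true); // ordered, no gaps: a phrase
    }
  }
}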
@@ -0,0 +1,216 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Code adopted from ASL-licensed Elasticsearch.
 * https://github.com/elastic/elasticsearch/blob/7.10/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java
 *
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.queries.intervals;

import java.io.IOException;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.LuceneTestCase;

public class TestIntervalBuilder extends LuceneTestCase {
  public void testSimpleTerm() throws IOException {
    CannedTokenStream ts = new CannedTokenStream(new Token("term1", 1, 2));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected = Intervals.term("term1");

    assertEquals(expected, source);
  }

  public void testOrdered() throws IOException {
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3"));

    assertEquals(expected, source);
  }

  public void testUnordered() throws IOException {
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, false);
    IntervalsSource expected =
        Intervals.unordered(
            Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3"));

    assertEquals(expected, source);
  }

  public void testPhrase() throws IOException {
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true);
    IntervalsSource expected =
        Intervals.phrase(Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3"));

    assertEquals(expected, source);
  }

  public void testPhraseWithStopword() throws IOException {
    CannedTokenStream ts =
        new CannedTokenStream(new Token("term1", 1, 1, 2), new Token("term3", 2, 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true);
    IntervalsSource expected =
        Intervals.phrase(Intervals.term("term1"), Intervals.extend(Intervals.term("term3"), 1, 0));

    assertEquals(expected, source);
  }

  public void testEmptyTokenStream() throws IOException {
    CannedTokenStream ts = new CannedTokenStream();
    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true);
    assertSame(IntervalBuilder.NO_INTERVALS, source);
  }

  public void testSimpleSynonyms() throws IOException {
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2),
            new Token("term2", 3, 4),
            new Token("term4", 0, 3, 4),
            new Token("term3", 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"),
            Intervals.or(Intervals.term("term2"), Intervals.term("term4")),
            Intervals.term("term3"));

    assertEquals(expected, source);
  }

  public void testSimpleSynonymsWithGap() throws IOException {
    // term1 [] term2/term3/term4 term5
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2),
            new Token("term2", 2, 3, 4),
            new Token("term3", 0, 3, 4),
            new Token("term4", 0, 3, 4),
            new Token("term5", 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"),
            Intervals.extend(
                Intervals.or(
                    Intervals.term("term2"), Intervals.term("term3"), Intervals.term("term4")),
                1,
                0),
            Intervals.term("term5"));
    assertEquals(expected, source);
  }

  public void testGraphSynonyms() throws IOException {
    // term1 term2:2/term3 term4 term5
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2),
            new Token("term2", 1, 3, 4, 2),
            new Token("term3", 0, 3, 4),
            new Token("term4", 5, 6),
            new Token("term5", 6, 7));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"),
            Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")),
            Intervals.term("term5"));

    assertEquals(expected, source);
  }

  public void testGraphSynonymsWithGaps() throws IOException {
    // term1 [] term2:4/term3 [] [] term4 term5
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2),
            new Token("term2", 2, 3, 4, 4),
            new Token("term3", 0, 3, 4),
            new Token("term4", 3, 5, 6),
            new Token("term5", 6, 7));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"),
            Intervals.or(
                Intervals.extend(Intervals.term("term2"), 1, 0),
                Intervals.phrase(
                    Intervals.extend(Intervals.term("term3"), 1, 0),
                    Intervals.extend(Intervals.term("term4"), 2, 0))),
            Intervals.term("term5"));

    assertEquals(expected, source);
  }

  public void testGraphTerminatesOnGap() throws IOException {
    // term1 term2:2/term3 term4 [] term5
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2),
            new Token("term2", 1, 2, 3, 2),
            new Token("term3", 0, 2, 3),
            new Token("term4", 2, 3),
            new Token("term5", 2, 6, 7));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"),
            Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")),
            Intervals.extend(Intervals.term("term5"), 1, 0));
    assertEquals(expected, source);
  }
}