From 32d7f52446ae363b45817b16d8f8ac106e1b461c Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Fri, 5 Nov 2021 08:57:48 +0100 Subject: [PATCH] LUCENE-10220: Add an utility method to get IntervalSource from analyzed text (or token stream) (#427) --- lucene/CHANGES.txt | 4 +- .../matchhighlight/TestMatchHighlighter.java | 54 +++ .../queries/intervals/IntervalBuilder.java | 325 ++++++++++++++++++ .../lucene/queries/intervals/Intervals.java | 70 +++- .../intervals/TestIntervalBuilder.java | 216 ++++++++++++ 5 files changed, 658 insertions(+), 11 deletions(-) create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalBuilder.java create mode 100644 lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalBuilder.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7010a64a521..20b8554be04 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -37,7 +37,9 @@ API Changes New Features --------------------- -(No changes) + +* LUCENE-10220: Add a utility method to get IntervalsSource from analyzed text (or token stream). 
+ (Uwe Schindler, Dawid Weiss, Alan Woodward) Improvements --------------------- diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java index bafb75d4144..a441b903f54 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java @@ -32,10 +32,13 @@ import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.synonym.SynonymGraphFilter; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.document.Field; @@ -264,6 +267,57 @@ public class TestMatchHighlighter extends LuceneTestCase { }); } + @Test + public void testAnalyzedTextIntervals() throws IOException { + SynonymMap synonymMap = + buildSynonymMap( + new String[][] { + {"moon\u0000shine", "firewater"}, + {"firewater", "moon\u0000shine"}, + }); + + Analyzer analyzer = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new StandardTokenizer(); + TokenStream ts = tokenizer; + ts = new LowerCaseFilter(ts); + ts = new SynonymGraphFilter(ts, synonymMap, true); + ts = new PorterStemFilter(ts); + return new TokenStreamComponents(tokenizer, ts); + } + }; + + new IndexBuilder(this::toField) + .doc(FLD_TEXT1, "Where the moon 
shine falls, firewater flows.") + .build( + analyzer, + reader -> { + IndexSearcher searcher = new IndexSearcher(reader); + Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered. + + MatchHighlighter highlighter = + new MatchHighlighter(searcher, analyzer) + .appendFieldHighlighter( + FieldValueHighlighters.highlighted( + 80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals)) + .appendFieldHighlighter(FieldValueHighlighters.skipRemaining()); + + { + // [moon shine, firewater] are synonyms, tokens are lowercased. Porter stemming on. + Query query = + new IntervalQuery( + FLD_TEXT1, + Intervals.analyzedText("Firewater Fall", analyzer, FLD_TEXT1, 0, true)); + + assertHighlights( + toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)), + "0. text1: Where the >moon shine falls<, firewater flows."); + } + }); + } + @Test public void testCustomFieldHighlightHandling() throws IOException { // Match highlighter is a showcase of individual components in this package, suitable diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalBuilder.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalBuilder.java new file mode 100644 index 00000000000..b6e1727616c --- /dev/null +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalBuilder.java @@ -0,0 +1,325 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Code adopted from ASL-licensed Elasticsearch. + * https://github.com/elastic/elasticsearch/blob/7.10/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java + * + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.lucene.queries.intervals; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.QueryVisitor; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings; + +/** + * Constructs an {@link IntervalsSource} based on analyzed text. + * + *

Code adopted from ASL-licensed Elasticsearch. + * + * @see + * "https://github.com/elastic/elasticsearch/blob/7.10/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java" + */ +final class IntervalBuilder { + static IntervalsSource analyzeText(CachingTokenFilter stream, int maxGaps, boolean ordered) + throws IOException { + assert stream != null; + + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class); + PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class); + + if (termAtt == null) { + return NO_INTERVALS; + } + + // phase 1: read through the stream and assess the situation: + // counting the number of tokens/positions and marking if we have any synonyms. + + int numTokens = 0; + boolean hasSynonyms = false; + boolean isGraph = false; + + stream.reset(); + while (stream.incrementToken()) { + numTokens++; + int positionIncrement = posIncAtt.getPositionIncrement(); + if (positionIncrement == 0) { + hasSynonyms = true; + } + int positionLength = posLenAtt.getPositionLength(); + if (positionLength > 1) { + isGraph = true; + } + } + + // phase 2: based on token count, presence of synonyms, and options + // formulate a single term, boolean, or phrase. 
+ + if (numTokens == 0) { + return NO_INTERVALS; + } else if (numTokens == 1) { + // single term + return analyzeTerm(stream); + } else if (isGraph) { + // graph + return combineSources(analyzeGraph(stream), maxGaps, ordered); + } else { + // phrase + if (hasSynonyms) { + // phrase with single-term synonyms + return analyzeSynonyms(stream, maxGaps, ordered); + } else { + // simple phrase + return combineSources(analyzeTerms(stream), maxGaps, ordered); + } + } + } + + private static IntervalsSource analyzeTerm(TokenStream ts) throws IOException { + TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); + ts.reset(); + ts.incrementToken(); + return Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef())); + } + + private static IntervalsSource combineSources( + List sources, int maxGaps, boolean ordered) { + if (sources.size() == 0) { + return NO_INTERVALS; + } + if (sources.size() == 1) { + return sources.get(0); + } + IntervalsSource[] sourcesArray = sources.toArray(new IntervalsSource[0]); + if (maxGaps == 0 && ordered) { + return Intervals.phrase(sourcesArray); + } + IntervalsSource inner = + ordered ? 
Intervals.ordered(sourcesArray) : Intervals.unordered(sourcesArray); + if (maxGaps == -1) { + return inner; + } + return Intervals.maxgaps(maxGaps, inner); + } + + private static List analyzeTerms(TokenStream ts) throws IOException { + List terms = new ArrayList<>(); + TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); + PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class); + ts.reset(); + while (ts.incrementToken()) { + BytesRef term = bytesAtt.getBytesRef(); + int precedingSpaces = posAtt.getPositionIncrement() - 1; + terms.add(extend(Intervals.term(BytesRef.deepCopyOf(term)), precedingSpaces)); + } + ts.end(); + return terms; + } + + private static IntervalsSource extend(IntervalsSource source, int precedingSpaces) { + if (precedingSpaces == 0) { + return source; + } + return Intervals.extend(source, precedingSpaces, 0); + } + + private static IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean ordered) + throws IOException { + List terms = new ArrayList<>(); + List synonyms = new ArrayList<>(); + TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); + PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class); + ts.reset(); + int spaces = 0; + while (ts.incrementToken()) { + int posInc = posAtt.getPositionIncrement(); + if (posInc > 0) { + if (synonyms.size() == 1) { + terms.add(extend(synonyms.get(0), spaces)); + } else if (synonyms.size() > 1) { + terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces)); + } + synonyms.clear(); + spaces = posInc - 1; + } + synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef()))); + } + if (synonyms.size() == 1) { + terms.add(extend(synonyms.get(0), spaces)); + } else { + terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces)); + } + return combineSources(terms, maxGaps, ordered); + } + + private static List 
analyzeGraph(TokenStream source) throws IOException { + source.reset(); + GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source); + + List clauses = new ArrayList<>(); + int[] articulationPoints = graph.articulationPoints(); + int lastState = 0; + int maxClauseCount = BooleanQuery.getMaxClauseCount(); + for (int i = 0; i <= articulationPoints.length; i++) { + int start = lastState; + int end = -1; + if (i < articulationPoints.length) { + end = articulationPoints[i]; + } + lastState = end; + if (graph.hasSidePath(start)) { + List paths = new ArrayList<>(); + Iterator it = graph.getFiniteStrings(start, end); + while (it.hasNext()) { + TokenStream ts = it.next(); + IntervalsSource phrase = combineSources(analyzeTerms(ts), 0, true); + if (paths.size() >= maxClauseCount) { + throw new BooleanQuery.TooManyClauses(); + } + paths.add(phrase); + } + if (paths.size() > 0) { + clauses.add(Intervals.or(paths.toArray(new IntervalsSource[0]))); + } + } else { + Iterator it = graph.getFiniteStrings(start, end); + TokenStream ts = it.next(); + clauses.addAll(analyzeTerms(ts)); + assert it.hasNext() == false; + } + } + return clauses; + } + + static final IntervalsSource NO_INTERVALS = + new IntervalsSource() { + @Override + public IntervalIterator intervals(String field, LeafReaderContext ctx) { + return new IntervalIterator() { + @Override + public int start() { + return NO_MORE_INTERVALS; + } + + @Override + public int end() { + return NO_MORE_INTERVALS; + } + + @Override + public int gaps() { + throw new UnsupportedOperationException(); + } + + @Override + public int nextInterval() { + return NO_MORE_INTERVALS; + } + + @Override + public float matchCost() { + return 0; + } + + @Override + public int docID() { + return NO_MORE_DOCS; + } + + @Override + public int nextDoc() { + return NO_MORE_DOCS; + } + + @Override + public int advance(int target) { + return NO_MORE_DOCS; + } + + @Override + public long cost() { + return 0; + } + }; + } + + @Override + 
public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) { + return null; + } + + @Override + public void visit(String field, QueryVisitor visitor) {} + + @Override + public int minExtent() { + return 0; + } + + @Override + public Collection pullUpDisjunctions() { + return Collections.emptyList(); + } + + @Override + public int hashCode() { + return 0; + } + + @Override + public boolean equals(Object other) { + return other == this; + } + + @Override + public String toString() { + return "no_match"; + } + }; +} diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java index 1094dbeee19..b79d48fd2a4 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java @@ -17,9 +17,13 @@ package org.apache.lucene.queries.intervals; +import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.function.Predicate; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.Term; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.WildcardQuery; @@ -30,17 +34,17 @@ import org.apache.lucene.util.automaton.CompiledAutomaton; * Constructor functions for {@link IntervalsSource} types * *

These sources implement minimum-interval algorithms taken from the paper - * Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics + * href="https://vigna.di.unimi.it/ftp/papers/EfficientLazy.pdf">Efficient Optimally Lazy Algorithms + * for Minimal-Interval Semantics * - *

By default, sources that are sensitive to internal gaps (e.g. PHRASE and MAXGAPS) will rewrite - * their sub-sources so that disjunctions of different lengths are pulled up to the top of the - * interval tree. For example, PHRASE(or(PHRASE("a", "b", "c"), "b"), "c") will automatically - * rewrite itself to OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c")) to ensure that documents - * containing "b c" are matched. This can lead to less efficient queries, as more terms need to be - * loaded (for example, the "c" iterator above is loaded twice), so if you care more about speed - * than about accuracy you can use the {@link #or(boolean, IntervalsSource...)} factory method to - * prevent rewriting. + *

By default, sources that are sensitive to internal gaps (e.g. {@code PHRASE} and {@code + * MAXGAPS}) will rewrite their sub-sources so that disjunctions of different lengths are pulled up + * to the top of the interval tree. For example, {@code PHRASE(or(PHRASE("a", "b", "c"), "b"), "c")} + * will automatically rewrite itself to {@code OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c"))} to + * ensure that documents containing {@code "b c"} are matched. This can lead to less efficient + * queries, as more terms need to be loaded (for example, the {@code "c"} iterator above is loaded + * twice), so if you care more about speed than about accuracy you can use the {@link #or(boolean, + * IntervalsSource...)} factory method to prevent rewriting. */ public final class Intervals { @@ -429,4 +433,50 @@ public final class Intervals { source, Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE)); } + + /** + * Returns intervals that correspond to tokens from a {@link TokenStream} returned for {@code + * text} by applying the provided {@link Analyzer} as if {@code text} was the content of the given + * {@code field}. The intervals can be ordered or unordered and can have optional gaps inside. + * + * @param text The text to analyze. + * @param analyzer The {@link Analyzer} to use to acquire a {@link TokenStream} which is then + * converted into intervals. + * @param field The field {@code text} should be parsed as. + * @param maxGaps Maximum number of allowed gaps between sub-intervals resulting from tokens. + * @param ordered Whether sub-intervals should enforce token ordering or not. + * @return Returns an {@link IntervalsSource} that matches tokens acquired from analysis of {@code + * text}. Possibly an empty interval source, never {@code null}. + * @throws IOException If an I/O exception occurs. 
+ */ + public static IntervalsSource analyzedText( + String text, Analyzer analyzer, String field, int maxGaps, boolean ordered) + throws IOException { + try (TokenStream ts = analyzer.tokenStream(field, text)) { + return analyzedText(ts, maxGaps, ordered); + } + } + + /** + * Returns intervals that correspond to tokens from the provided {@link TokenStream}. This is a + * low-level counterpart to {@link #analyzedText(String, Analyzer, String, int, boolean)}. The + * intervals can be ordered or unordered and can have optional gaps inside. + * + * @param tokenStream The token stream to produce intervals for. The token stream may be fully or + * partially consumed after returning from this method. + * @param maxGaps Maximum number of allowed gaps between sub-intervals resulting from tokens. + * @param ordered Whether sub-intervals should enforce token ordering or not. + * @return Returns an {@link IntervalsSource} that matches tokens acquired from {@code + * tokenStream}. Possibly an empty interval source, never {@code null}. + * @throws IOException If an I/O exception occurs. + */ + public static IntervalsSource analyzedText(TokenStream tokenStream, int maxGaps, boolean ordered) + throws IOException { + CachingTokenFilter stream = + tokenStream instanceof CachingTokenFilter + ? (CachingTokenFilter) tokenStream + : new CachingTokenFilter(tokenStream); + + return IntervalBuilder.analyzeText(stream, maxGaps, ordered); + } } diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalBuilder.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalBuilder.java new file mode 100644 index 00000000000..a51a69062b4 --- /dev/null +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalBuilder.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Code adopted from ASL-licensed Elasticsearch. + * https://github.com/elastic/elasticsearch/blob/7.10/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java + * + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.lucene.queries.intervals; + +import java.io.IOException; +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.CannedTokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.util.LuceneTestCase; + +public class TestIntervalBuilder extends LuceneTestCase { + public void testSimpleTerm() throws IOException { + CannedTokenStream ts = new CannedTokenStream(new Token("term1", 1, 2)); + + IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true); + IntervalsSource expected = Intervals.term("term1"); + + assertEquals(expected, source); + } + + public void testOrdered() throws IOException { + CannedTokenStream ts = + new CannedTokenStream( + new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6)); + + IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true); + IntervalsSource expected = + Intervals.ordered( + Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3")); + + assertEquals(expected, source); + } + + public void testUnordered() throws IOException { + CannedTokenStream ts = + new CannedTokenStream( + new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6)); + + IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, false); + IntervalsSource expected = + Intervals.unordered( + Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3")); + + assertEquals(expected, source); + } + + public void testPhrase() throws IOException { + CannedTokenStream ts = + new CannedTokenStream( + new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6)); + + IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true); + IntervalsSource expected = + Intervals.phrase(Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3")); + + assertEquals(expected, source); + } + + 
public void testPhraseWithStopword() throws IOException { + CannedTokenStream ts = + new CannedTokenStream(new Token("term1", 1, 1, 2), new Token("term3", 2, 5, 6)); + + IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true); + IntervalsSource expected = + Intervals.phrase(Intervals.term("term1"), Intervals.extend(Intervals.term("term3"), 1, 0)); + + assertEquals(expected, source); + } + + public void testEmptyTokenStream() throws IOException { + CannedTokenStream ts = new CannedTokenStream(); + IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true); + assertSame(IntervalBuilder.NO_INTERVALS, source); + } + + public void testSimpleSynonyms() throws IOException { + CannedTokenStream ts = + new CannedTokenStream( + new Token("term1", 1, 2), + new Token("term2", 3, 4), + new Token("term4", 0, 3, 4), + new Token("term3", 5, 6)); + + IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true); + IntervalsSource expected = + Intervals.ordered( + Intervals.term("term1"), + Intervals.or(Intervals.term("term2"), Intervals.term("term4")), + Intervals.term("term3")); + + assertEquals(expected, source); + } + + public void testSimpleSynonymsWithGap() throws IOException { + // term1 [] term2/term3/term4 term5 + CannedTokenStream ts = + new CannedTokenStream( + new Token("term1", 1, 2), + new Token("term2", 2, 3, 4), + new Token("term3", 0, 3, 4), + new Token("term4", 0, 3, 4), + new Token("term5", 5, 6)); + + IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true); + IntervalsSource expected = + Intervals.ordered( + Intervals.term("term1"), + Intervals.extend( + Intervals.or( + Intervals.term("term2"), Intervals.term("term3"), Intervals.term("term4")), + 1, + 0), + Intervals.term("term5")); + assertEquals(expected, source); + } + + public void testGraphSynonyms() throws IOException { + // term1 term2:2/term3 term4 term5 + CannedTokenStream ts = 
+ new CannedTokenStream( + new Token("term1", 1, 2), + new Token("term2", 1, 3, 4, 2), + new Token("term3", 0, 3, 4), + new Token("term4", 5, 6), + new Token("term5", 6, 7)); + + IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true); + IntervalsSource expected = + Intervals.ordered( + Intervals.term("term1"), + Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")), + Intervals.term("term5")); + + assertEquals(expected, source); + } + + public void testGraphSynonymsWithGaps() throws IOException { + // term1 [] term2:4/term3 [] [] term4 term5 + CannedTokenStream ts = + new CannedTokenStream( + new Token("term1", 1, 2), + new Token("term2", 2, 3, 4, 4), + new Token("term3", 0, 3, 4), + new Token("term4", 3, 5, 6), + new Token("term5", 6, 7)); + + IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true); + IntervalsSource expected = + Intervals.ordered( + Intervals.term("term1"), + Intervals.or( + Intervals.extend(Intervals.term("term2"), 1, 0), + Intervals.phrase( + Intervals.extend(Intervals.term("term3"), 1, 0), + Intervals.extend(Intervals.term("term4"), 2, 0))), + Intervals.term("term5")); + + assertEquals(expected, source); + } + + public void testGraphTerminatesOnGap() throws IOException { + // term1 term2:2/term3 term4 [] term5 + CannedTokenStream ts = + new CannedTokenStream( + new Token("term1", 1, 2), + new Token("term2", 1, 2, 3, 2), + new Token("term3", 0, 2, 3), + new Token("term4", 2, 3), + new Token("term5", 2, 6, 7)); + + IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true); + IntervalsSource expected = + Intervals.ordered( + Intervals.term("term1"), + Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")), + Intervals.extend(Intervals.term("term5"), 1, 0)); + assertEquals(expected, source); + } +}