mirror of https://github.com/apache/lucene.git

LUCENE-10220: Add a utility method to get IntervalsSource from analyzed text (or token stream) (#427)

This commit is contained in:
parent bec8eaef70
commit 32d7f52446
@@ -37,7 +37,9 @@ API Changes
 
 New Features
 ---------------------
-(No changes)
+
+* LUCENE-10220: Add a utility method to get IntervalsSource from analyzed text (or token stream).
+  (Uwe Schindler, Dawid Weiss, Alan Woodward)
 
 Improvements
 ---------------------
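For context, a minimal usage sketch of the new factory method (not part of the diff; the StandardAnalyzer and the "body" field name are illustrative assumptions):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.Query;

class AnalyzedTextUsageSketch {
  // Builds an interval query from free text analyzed as the (hypothetical) field "body".
  static Query intervalQueryFor(String text) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(); // illustrative analyzer choice
    // maxGaps = 0 and ordered = true: the analyzed tokens must appear
    // adjacent and in order, i.e. an analyzed phrase.
    IntervalsSource source = Intervals.analyzedText(text, analyzer, "body", 0, true);
    return new IntervalQuery("body", source);
  }
}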
@@ -32,10 +32,13 @@ import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.document.Field;
@@ -264,6 +267,57 @@ public class TestMatchHighlighter extends LuceneTestCase {
         });
   }
 
+  @Test
+  public void testAnalyzedTextIntervals() throws IOException {
+    SynonymMap synonymMap =
+        buildSynonymMap(
+            new String[][] {
+              {"moon\u0000shine", "firewater"},
+              {"firewater", "moon\u0000shine"},
+            });
+
+    Analyzer analyzer =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new StandardTokenizer();
+            TokenStream ts = tokenizer;
+            ts = new LowerCaseFilter(ts);
+            ts = new SynonymGraphFilter(ts, synonymMap, true);
+            ts = new PorterStemFilter(ts);
+            return new TokenStreamComponents(tokenizer, ts);
+          }
+        };
+
+    new IndexBuilder(this::toField)
+        .doc(FLD_TEXT1, "Where the moon shine falls, firewater flows.")
+        .build(
+            analyzer,
+            reader -> {
+              IndexSearcher searcher = new IndexSearcher(reader);
+              Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
+
+              MatchHighlighter highlighter =
+                  new MatchHighlighter(searcher, analyzer)
+                      .appendFieldHighlighter(
+                          FieldValueHighlighters.highlighted(
+                              80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
+                      .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+              {
+                // [moon shine, firewater] are synonyms, tokens are lowercased. Porter stemming on.
+                Query query =
+                    new IntervalQuery(
+                        FLD_TEXT1,
+                        Intervals.analyzedText("Firewater Fall", analyzer, FLD_TEXT1, 0, true));
+
+                assertHighlights(
+                    toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
+                    "0. text1: Where the >moon shine falls<, firewater flows.");
+              }
+            });
+  }
+
   @Test
   public void testCustomFieldHighlightHandling() throws IOException {
     // Match highlighter is a showcase of individual components in this package, suitable
@@ -0,0 +1,325 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Code adopted from ASL-licensed Elasticsearch.
 * https://github.com/elastic/elasticsearch/blob/7.10/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java
 *
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.queries.intervals;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;

/**
 * Constructs an {@link IntervalsSource} based on analyzed text.
 *
 * <p>Code adopted from ASL-licensed <a
 * href="https://github.com/elastic/elasticsearch">Elasticsearch</a>.
 *
 * @see
 *     "https://github.com/elastic/elasticsearch/blob/7.10/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java"
 */
final class IntervalBuilder {
  static IntervalsSource analyzeText(CachingTokenFilter stream, int maxGaps, boolean ordered)
      throws IOException {
    assert stream != null;

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class);

    if (termAtt == null) {
      return NO_INTERVALS;
    }

    // phase 1: read through the stream and assess the situation:
    // counting the number of tokens/positions and marking if we have any synonyms.

    int numTokens = 0;
    boolean hasSynonyms = false;
    boolean isGraph = false;

    stream.reset();
    while (stream.incrementToken()) {
      numTokens++;
      int positionIncrement = posIncAtt.getPositionIncrement();
      if (positionIncrement == 0) {
        hasSynonyms = true;
      }
      int positionLength = posLenAtt.getPositionLength();
      if (positionLength > 1) {
        isGraph = true;
      }
    }

    // phase 2: based on token count, presence of synonyms, and options
    // formulate a single term, boolean, or phrase.

    if (numTokens == 0) {
      return NO_INTERVALS;
    } else if (numTokens == 1) {
      // single term
      return analyzeTerm(stream);
    } else if (isGraph) {
      // graph
      return combineSources(analyzeGraph(stream), maxGaps, ordered);
    } else {
      // phrase
      if (hasSynonyms) {
        // phrase with single-term synonyms
        return analyzeSynonyms(stream, maxGaps, ordered);
      } else {
        // simple phrase
        return combineSources(analyzeTerms(stream), maxGaps, ordered);
      }
    }
  }

  private static IntervalsSource analyzeTerm(TokenStream ts) throws IOException {
    TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    ts.reset();
    ts.incrementToken();
    return Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef()));
  }

  private static IntervalsSource combineSources(
      List<IntervalsSource> sources, int maxGaps, boolean ordered) {
    if (sources.size() == 0) {
      return NO_INTERVALS;
    }
    if (sources.size() == 1) {
      return sources.get(0);
    }
    IntervalsSource[] sourcesArray = sources.toArray(new IntervalsSource[0]);
    if (maxGaps == 0 && ordered) {
      return Intervals.phrase(sourcesArray);
    }
    IntervalsSource inner =
        ordered ? Intervals.ordered(sourcesArray) : Intervals.unordered(sourcesArray);
    if (maxGaps == -1) {
      return inner;
    }
    return Intervals.maxgaps(maxGaps, inner);
  }

  private static List<IntervalsSource> analyzeTerms(TokenStream ts) throws IOException {
    List<IntervalsSource> terms = new ArrayList<>();
    TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      BytesRef term = bytesAtt.getBytesRef();
      int precedingSpaces = posAtt.getPositionIncrement() - 1;
      terms.add(extend(Intervals.term(BytesRef.deepCopyOf(term)), precedingSpaces));
    }
    ts.end();
    return terms;
  }

  private static IntervalsSource extend(IntervalsSource source, int precedingSpaces) {
    if (precedingSpaces == 0) {
      return source;
    }
    return Intervals.extend(source, precedingSpaces, 0);
  }

  private static IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean ordered)
      throws IOException {
    List<IntervalsSource> terms = new ArrayList<>();
    List<IntervalsSource> synonyms = new ArrayList<>();
    TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    int spaces = 0;
    while (ts.incrementToken()) {
      int posInc = posAtt.getPositionIncrement();
      if (posInc > 0) {
        if (synonyms.size() == 1) {
          terms.add(extend(synonyms.get(0), spaces));
        } else if (synonyms.size() > 1) {
          terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
        }
        synonyms.clear();
        spaces = posInc - 1;
      }
      synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef())));
    }
    if (synonyms.size() == 1) {
      terms.add(extend(synonyms.get(0), spaces));
    } else {
      terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
    }
    return combineSources(terms, maxGaps, ordered);
  }

  private static List<IntervalsSource> analyzeGraph(TokenStream source) throws IOException {
    source.reset();
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);

    List<IntervalsSource> clauses = new ArrayList<>();
    int[] articulationPoints = graph.articulationPoints();
    int lastState = 0;
    int maxClauseCount = BooleanQuery.getMaxClauseCount();
    for (int i = 0; i <= articulationPoints.length; i++) {
      int start = lastState;
      int end = -1;
      if (i < articulationPoints.length) {
        end = articulationPoints[i];
      }
      lastState = end;
      if (graph.hasSidePath(start)) {
        List<IntervalsSource> paths = new ArrayList<>();
        Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
        while (it.hasNext()) {
          TokenStream ts = it.next();
          IntervalsSource phrase = combineSources(analyzeTerms(ts), 0, true);
          if (paths.size() >= maxClauseCount) {
            throw new BooleanQuery.TooManyClauses();
          }
          paths.add(phrase);
        }
        if (paths.size() > 0) {
          clauses.add(Intervals.or(paths.toArray(new IntervalsSource[0])));
        }
      } else {
        Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
        TokenStream ts = it.next();
        clauses.addAll(analyzeTerms(ts));
        assert it.hasNext() == false;
      }
    }
    return clauses;
  }

  static final IntervalsSource NO_INTERVALS =
      new IntervalsSource() {
        @Override
        public IntervalIterator intervals(String field, LeafReaderContext ctx) {
          return new IntervalIterator() {
            @Override
            public int start() {
              return NO_MORE_INTERVALS;
            }

            @Override
            public int end() {
              return NO_MORE_INTERVALS;
            }

            @Override
            public int gaps() {
              throw new UnsupportedOperationException();
            }

            @Override
            public int nextInterval() {
              return NO_MORE_INTERVALS;
            }

            @Override
            public float matchCost() {
              return 0;
            }

            @Override
            public int docID() {
              return NO_MORE_DOCS;
            }

            @Override
            public int nextDoc() {
              return NO_MORE_DOCS;
            }

            @Override
            public int advance(int target) {
              return NO_MORE_DOCS;
            }

            @Override
            public long cost() {
              return 0;
            }
          };
        }

        @Override
        public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) {
          return null;
        }

        @Override
        public void visit(String field, QueryVisitor visitor) {}

        @Override
        public int minExtent() {
          return 0;
        }

        @Override
        public Collection<IntervalsSource> pullUpDisjunctions() {
          return Collections.emptyList();
        }

        @Override
        public int hashCode() {
          return 0;
        }

        @Override
        public boolean equals(Object other) {
          return other == this;
        }

        @Override
        public String toString() {
          return "no_match";
        }
      };
}
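As background for the phase-1 flags in analyzeText() above: by token-stream convention, a position increment of 0 marks a token stacked on the previous position (a synonym), and a position length greater than 1 marks a token spanning several positions (a graph). A small sketch of streams that would take each branch, using the Token and CannedTokenStream test helpers that also appear in the new test file below (the terms are illustrative):

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;

class PhaseOneFlagsSketch {
  // Token(text, posInc, startOffset, endOffset[, posLength]).

  // posInc == 0 stacks "whisky" on the same position as "scotch", so
  // analyzeText() sets hasSynonyms and routes to analyzeSynonyms().
  static CannedTokenStream synonyms() {
    return new CannedTokenStream(new Token("scotch", 1, 0, 6), new Token("whisky", 0, 0, 6));
  }

  // posLength == 2 lets "new_york" span two positions alongside "new york",
  // so analyzeText() sets isGraph and routes to analyzeGraph().
  static CannedTokenStream graph() {
    return new CannedTokenStream(
        new Token("new", 1, 0, 3), new Token("new_york", 0, 0, 8, 2), new Token("york", 1, 4, 8));
  }
}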
@@ -17,9 +17,13 @@
 package org.apache.lucene.queries.intervals;
 
+import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
 import java.util.function.Predicate;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.WildcardQuery;
@@ -30,17 +34,17 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
  * Constructor functions for {@link IntervalsSource} types
  *
  * <p>These sources implement minimum-interval algorithms taken from the paper <a
- * href="http://vigna.di.unimi.it/ftp/papers/EfficientAlgorithmsMinimalIntervalSemantics.pdf">
- * Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics</a>
+ * href="https://vigna.di.unimi.it/ftp/papers/EfficientLazy.pdf">Efficient Optimally Lazy Algorithms
+ * for Minimal-Interval Semantics</a>
  *
- * <p>By default, sources that are sensitive to internal gaps (e.g. PHRASE and MAXGAPS) will rewrite
- * their sub-sources so that disjunctions of different lengths are pulled up to the top of the
- * interval tree. For example, PHRASE(or(PHRASE("a", "b", "c"), "b"), "c") will automatically
- * rewrite itself to OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c")) to ensure that documents
- * containing "b c" are matched. This can lead to less efficient queries, as more terms need to be
- * loaded (for example, the "c" iterator above is loaded twice), so if you care more about speed
- * than about accuracy you can use the {@link #or(boolean, IntervalsSource...)} factory method to
- * prevent rewriting.
+ * <p>By default, sources that are sensitive to internal gaps (e.g. {@code PHRASE} and {@code
+ * MAXGAPS}) will rewrite their sub-sources so that disjunctions of different lengths are pulled up
+ * to the top of the interval tree. For example, {@code PHRASE(or(PHRASE("a", "b", "c"), "b"), "c")}
+ * will automatically rewrite itself to {@code OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c"))} to
+ * ensure that documents containing {@code "b c"} are matched. This can lead to less efficient
+ * queries, as more terms need to be loaded (for example, the {@code "c"} iterator above is loaded
+ * twice), so if you care more about speed than about accuracy you can use the {@link #or(boolean,
+ * IntervalsSource...)} factory method to prevent rewriting.
  */
 public final class Intervals {
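The rewrite described in the javadoc above, spelled out as a sketch (factory methods as declared in this class; the terms are the javadoc's own example):

import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;

class RewriteSketch {
  // PHRASE(or(PHRASE("a", "b", "c"), "b"), "c"): the gap-sensitive phrase
  // pulls the disjunction up, behaving like
  // OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c")).
  static IntervalsSource rewritten() {
    return Intervals.phrase(
        Intervals.or(Intervals.phrase("a", "b", "c"), Intervals.term("b")),
        Intervals.term("c"));
  }

  // The or(boolean, ...) overload disables the pull-up: faster, but documents
  // containing only "b c" may be missed.
  static IntervalsSource unrewritten() {
    return Intervals.phrase(
        Intervals.or(false, Intervals.phrase("a", "b", "c"), Intervals.term("b")),
        Intervals.term("c"));
  }
}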
@@ -429,4 +433,50 @@ public final class Intervals {
         source,
         Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE));
   }
+
+  /**
+   * Returns intervals that correspond to tokens from a {@link TokenStream} returned for {@code
+   * text} by applying the provided {@link Analyzer} as if {@code text} was the content of the given
+   * {@code field}. The intervals can be ordered or unordered and can have optional gaps inside.
+   *
+   * @param text The text to analyze.
+   * @param analyzer The {@link Analyzer} to use to acquire a {@link TokenStream} which is then
+   *     converted into intervals.
+   * @param field The field {@code text} should be parsed as.
+   * @param maxGaps Maximum number of allowed gaps between sub-intervals resulting from tokens.
+   * @param ordered Whether sub-intervals should enforce token ordering or not.
+   * @return Returns an {@link IntervalsSource} that matches tokens acquired from analysis of {@code
+   *     text}. Possibly an empty interval source, never {@code null}.
+   * @throws IOException If an I/O exception occurs.
+   */
+  public static IntervalsSource analyzedText(
+      String text, Analyzer analyzer, String field, int maxGaps, boolean ordered)
+      throws IOException {
+    try (TokenStream ts = analyzer.tokenStream(field, text)) {
+      return analyzedText(ts, maxGaps, ordered);
+    }
+  }
+
+  /**
+   * Returns intervals that correspond to tokens from the provided {@link TokenStream}. This is a
+   * low-level counterpart to {@link #analyzedText(String, Analyzer, String, int, boolean)}. The
+   * intervals can be ordered or unordered and can have optional gaps inside.
+   *
+   * @param tokenStream The token stream to produce intervals for. The token stream may be fully or
+   *     partially consumed after returning from this method.
+   * @param maxGaps Maximum number of allowed gaps between sub-intervals resulting from tokens.
+   * @param ordered Whether sub-intervals should enforce token ordering or not.
+   * @return Returns an {@link IntervalsSource} that matches tokens acquired from analysis of {@code
+   *     text}. Possibly an empty interval source, never {@code null}.
+   * @throws IOException If an I/O exception occurs.
+   */
+  public static IntervalsSource analyzedText(TokenStream tokenStream, int maxGaps, boolean ordered)
+      throws IOException {
+    CachingTokenFilter stream =
+        tokenStream instanceof CachingTokenFilter
+            ? (CachingTokenFilter) tokenStream
+            : new CachingTokenFilter(tokenStream);
+
+    return IntervalBuilder.analyzeText(stream, maxGaps, ordered);
+  }
 }
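A short sketch contrasting the two overloads added above (assumptions: a StandardAnalyzer and a "title" field, neither of which appears in the diff):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;

class AnalyzedTextOverloadsSketch {
  // High-level overload: analyze the text as if it were field "title";
  // unordered, with up to 2 positions of slack between tokens.
  static IntervalsSource fromText() throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    return Intervals.analyzedText("quick brown fox", analyzer, "title", 2, false);
  }

  // Low-level overload: pass any TokenStream; it is wrapped in a
  // CachingTokenFilter internally and may be (partially) consumed.
  static IntervalsSource fromStream(Analyzer analyzer) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("title", "quick brown fox")) {
      return Intervals.analyzedText(ts, 0, true); // ordered, no gaps: a phrase
    }
  }
}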
@@ -0,0 +1,216 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Code adopted from ASL-licensed Elasticsearch.
 * https://github.com/elastic/elasticsearch/blob/7.10/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java
 *
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.queries.intervals;

import java.io.IOException;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.LuceneTestCase;

public class TestIntervalBuilder extends LuceneTestCase {
  public void testSimpleTerm() throws IOException {
    CannedTokenStream ts = new CannedTokenStream(new Token("term1", 1, 2));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected = Intervals.term("term1");

    assertEquals(expected, source);
  }

  public void testOrdered() throws IOException {
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3"));

    assertEquals(expected, source);
  }

  public void testUnordered() throws IOException {
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, false);
    IntervalsSource expected =
        Intervals.unordered(
            Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3"));

    assertEquals(expected, source);
  }

  public void testPhrase() throws IOException {
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true);
    IntervalsSource expected =
        Intervals.phrase(Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3"));

    assertEquals(expected, source);
  }

  public void testPhraseWithStopword() throws IOException {
    CannedTokenStream ts =
        new CannedTokenStream(new Token("term1", 1, 1, 2), new Token("term3", 2, 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true);
    IntervalsSource expected =
        Intervals.phrase(Intervals.term("term1"), Intervals.extend(Intervals.term("term3"), 1, 0));

    assertEquals(expected, source);
  }

  public void testEmptyTokenStream() throws IOException {
    CannedTokenStream ts = new CannedTokenStream();
    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true);
    assertSame(IntervalBuilder.NO_INTERVALS, source);
  }

  public void testSimpleSynonyms() throws IOException {
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2),
            new Token("term2", 3, 4),
            new Token("term4", 0, 3, 4),
            new Token("term3", 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"),
            Intervals.or(Intervals.term("term2"), Intervals.term("term4")),
            Intervals.term("term3"));

    assertEquals(expected, source);
  }

  public void testSimpleSynonymsWithGap() throws IOException {
    // term1 [] term2/term3/term4 term5
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2),
            new Token("term2", 2, 3, 4),
            new Token("term3", 0, 3, 4),
            new Token("term4", 0, 3, 4),
            new Token("term5", 5, 6));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"),
            Intervals.extend(
                Intervals.or(
                    Intervals.term("term2"), Intervals.term("term3"), Intervals.term("term4")),
                1,
                0),
            Intervals.term("term5"));
    assertEquals(expected, source);
  }

  public void testGraphSynonyms() throws IOException {
    // term1 term2:2/term3 term4 term5
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2),
            new Token("term2", 1, 3, 4, 2),
            new Token("term3", 0, 3, 4),
            new Token("term4", 5, 6),
            new Token("term5", 6, 7));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"),
            Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")),
            Intervals.term("term5"));

    assertEquals(expected, source);
  }

  public void testGraphSynonymsWithGaps() throws IOException {
    // term1 [] term2:4/term3 [] [] term4 term5
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2),
            new Token("term2", 2, 3, 4, 4),
            new Token("term3", 0, 3, 4),
            new Token("term4", 3, 5, 6),
            new Token("term5", 6, 7));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"),
            Intervals.or(
                Intervals.extend(Intervals.term("term2"), 1, 0),
                Intervals.phrase(
                    Intervals.extend(Intervals.term("term3"), 1, 0),
                    Intervals.extend(Intervals.term("term4"), 2, 0))),
            Intervals.term("term5"));

    assertEquals(expected, source);
  }

  public void testGraphTerminatesOnGap() throws IOException {
    // term1 term2:2/term3 term4 [] term5
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("term1", 1, 2),
            new Token("term2", 1, 2, 3, 2),
            new Token("term3", 0, 2, 3),
            new Token("term4", 2, 3),
            new Token("term5", 2, 6, 7));

    IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
    IntervalsSource expected =
        Intervals.ordered(
            Intervals.term("term1"),
            Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")),
            Intervals.extend(Intervals.term("term5"), 1, 0));
    assertEquals(expected, source);
  }
}