LUCENE-8646: Multi-term intervals

2025-02-10 03:55:46 +00:00 · 2019-01-17 13:41:19 +00:00 · 2019-01-17 13:41:19 +00:00 · 7d7ab14776
commit 7d7ab14776
parent 87d68c8253
5 changed files with 191 additions and 1 deletions
--- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java
@ -99,7 +99,7 @@ class DisjunctionIntervalsSource extends IntervalsSource {
    return minExtent;
  }

-  private static class DisjunctionIntervalIterator extends IntervalIterator {
+  static class DisjunctionIntervalIterator extends IntervalIterator {

    final DocIdSetIterator approximation;
    final PriorityQueue<IntervalIterator> intervalQueue;
--- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java
@ -19,7 +19,11 @@ package org.apache.lucene.search.intervals;

 import java.util.Arrays;

+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;

 /**
 * Constructor functions for {@link IntervalsSource} types
@ -75,6 +79,28 @@ public final class Intervals {
    return new DisjunctionIntervalsSource(Arrays.asList(subSources));
  }

+  /**
+   * Build an {@link IntervalsSource} that matches any term starting with the given prefix,
+   * presented as a disjunction over those terms.
+   *
+   * @param prefix the leading characters a term must have to be included
+   * @throws IllegalStateException if the prefix expands to too many terms (the expansion limit
+   *         passed to the source here is 128); raised by the returned source when it
+   *         enumerates matching terms, not by this factory call itself
+   */
+  public static IntervalsSource prefix(String prefix) {
+    final BytesRef prefixBytes = new BytesRef(prefix);
+    final CompiledAutomaton compiled = new CompiledAutomaton(PrefixQuery.toAutomaton(prefixBytes));
+    // The original string is handed over as well so the source can name it in messages and toString().
+    return new MultiTermIntervalsSource(compiled, 128, prefix);
+  }
+
+  /**
+   * Build an {@link IntervalsSource} that matches any term accepted by a wildcard glob,
+   * presented as a disjunction over those terms.
+   *
+   * @param wildcard the glob pattern that terms are matched against
+   * @throws IllegalStateException if the glob expands to too many terms (the expansion limit
+   *         passed to the source here is 128); raised by the returned source when it
+   *         enumerates matching terms, not by this factory call itself
+   *
+   * @see WildcardQuery for glob format
+   */
+  public static IntervalsSource wildcard(String wildcard) {
+    // The Term is built with an empty field name; presumably only its text matters to the
+    // automaton, since the field is supplied later when intervals are requested (TODO confirm).
+    final Term globTerm = new Term("", wildcard);
+    final CompiledAutomaton compiled = new CompiledAutomaton(WildcardQuery.toAutomaton(globTerm));
+    return new MultiTermIntervalsSource(compiled, 128, wildcard);
+  }
+
  /**
   * Create an {@link IntervalsSource} that filters a sub-source by the width of its intervals
   * @param width       the maximum width of intervals in the sub-source to filter
--- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MultiTermIntervalsSource.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MultiTermIntervalsSource.java
@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.intervals;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.MatchesIterator;
+import org.apache.lucene.search.MatchesUtils;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+
+/**
+ * An {@link IntervalsSource} that expands a {@link CompiledAutomaton} against the terms of a
+ * field and exposes the accepted terms as a disjunction of per-term sources.
+ *
+ * <p>Expansion is performed separately for every leaf reader that is asked for intervals or
+ * matches. If more than {@code maxExpansions} terms are collected during one such expansion an
+ * {@link IllegalStateException} is thrown.
+ */
+class MultiTermIntervalsSource extends IntervalsSource {
+
+  private final CompiledAutomaton automaton;
+  private final int maxExpansions;
+  private final String pattern;
+
+  /**
+   * @param automaton     the automaton deciding which terms are included
+   * @param maxExpansions the largest number of terms one expansion may collect
+   * @param pattern       the textual form of the automaton, used in error messages,
+   *                      {@link #toString()}, and equality
+   */
+  MultiTermIntervalsSource(CompiledAutomaton automaton, int maxExpansions, String pattern) {
+    this.automaton = automaton;
+    this.maxExpansions = maxExpansions;
+    this.pattern = pattern;
+  }
+
+  /**
+   * Collects one interval iterator per term accepted by the automaton and combines them into a
+   * disjunction.
+   *
+   * @return the combined iterator, or {@code null} if the field has no terms in this leaf or no
+   *         term is accepted
+   * @throws IllegalStateException if more than {@code maxExpansions} terms are accepted
+   */
+  @Override
+  public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
+    Terms terms = ctx.reader().terms(field);
+    if (terms == null) {
+      return null;
+    }
+    List<IntervalIterator> subSources = new ArrayList<>();
+    TermsEnum te = automaton.getTermsEnum(terms);
+    BytesRef term;
+    int count = 0;
+    while ((term = te.next()) != null) {
+      // Pre-increment: the (maxExpansions + 1)-th term must trip the limit. A post-increment
+      // comparison would let one extra term through before failing.
+      if (++count > maxExpansions) {
+        throw tooManyTerms();
+      }
+      subSources.add(TermIntervalsSource.intervals(term, te));
+    }
+    if (subSources.isEmpty()) {
+      return null;
+    }
+    return new DisjunctionIntervalsSource.DisjunctionIntervalIterator(subSources);
+  }
+
+  /**
+   * Collects the matches of every accepted term in the given document and combines them into a
+   * disjunction. Only terms that actually produce a match in {@code doc} count towards the
+   * expansion limit.
+   *
+   * @return the result of {@link MatchesUtils#disjunction} over the per-term matches, or
+   *         {@code null} if the field has no terms in this leaf
+   * @throws IllegalStateException if more than {@code maxExpansions} terms match in the document
+   */
+  @Override
+  public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
+    Terms terms = ctx.reader().terms(field);
+    if (terms == null) {
+      return null;
+    }
+    List<MatchesIterator> subMatches = new ArrayList<>();
+    TermsEnum te = automaton.getTermsEnum(terms);
+    int count = 0;
+    while (te.next() != null) {
+      MatchesIterator mi = TermIntervalsSource.matches(te, doc);
+      if (mi != null) {
+        // Same limit semantics as intervals(): fail on the first term beyond maxExpansions.
+        if (++count > maxExpansions) {
+          throw tooManyTerms();
+        }
+        subMatches.add(mi);
+      }
+    }
+    return MatchesUtils.disjunction(subMatches);
+  }
+
+  /** Builds the exception raised when an expansion exceeds {@code maxExpansions}. */
+  private IllegalStateException tooManyTerms() {
+    return new IllegalStateException("Automaton " + this.pattern + " expanded to too many terms (limit " + maxExpansions + ")");
+  }
+
+  /**
+   * Adds nothing to {@code terms}: the concrete terms are only known once the automaton is run
+   * against a reader's term dictionary, and no reader is available through this method.
+   */
+  @Override
+  public void extractTerms(String field, Set<Term> terms) {
+
+  }
+
+  /** Always reports a minimum extent of 1. */
+  @Override
+  public int minExtent() {
+    return 1;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) return true;
+    if (o == null || getClass() != o.getClass()) return false;
+    MultiTermIntervalsSource that = (MultiTermIntervalsSource) o;
+    return maxExpansions == that.maxExpansions &&
+        Objects.equals(automaton, that.automaton) &&
+        Objects.equals(pattern, that.pattern);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(automaton, maxExpansions, pattern);
+  }
+
+  @Override
+  public String toString() {
+    return "MultiTerm(" + pattern + ")";
+  }
+}
--- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/TermIntervalsSource.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/TermIntervalsSource.java
@ -55,6 +55,10 @@ class TermIntervalsSource extends IntervalsSource {
    if (te.seekExact(term) == false) {
      return null;
    }
+    return intervals(term, te);
+  }
+
+  static IntervalIterator intervals(BytesRef term, TermsEnum te) throws IOException {
    PostingsEnum pe = te.postings(null, PostingsEnum.POSITIONS);
    float cost = termPositionsCost(te);
    return new IntervalIterator() {
@ -143,6 +147,10 @@ class TermIntervalsSource extends IntervalsSource {
    if (te.seekExact(term) == false) {
      return null;
    }
+    return matches(te, doc);
+  }
+
+  static MatchesIterator matches(TermsEnum te, int doc) throws IOException {
    PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
    if (pe.advance(doc) != doc) {
      return null;
--- a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java
@ -726,4 +726,40 @@ public class TestIntervals extends LuceneTestCase {

  }

+  /** Checks intervals and matches produced by {@link Intervals#prefix(String)} on "field1". */
+  public void testPrefix() throws IOException {
+    final IntervalsSource prefixSource = Intervals.prefix("p");
+    // One row per document; each row presumably lists start/end position pairs
+    // (confirm against checkIntervals).
+    final int[][] expectedIntervals = {
+        {},
+        { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10, 27, 27 },
+        { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
+        { 7, 7 },
+        { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7, 10, 10 },
+        { 0, 0 }
+    };
+    checkIntervals(prefixSource, "field1", 5, expectedIntervals);
+
+    // First two matches in document 1.
+    final MatchesIterator matches = getMatches(prefixSource, 1, "field1");
+    assertNotNull(matches);
+    assertMatch(matches, 0, 0, 0, 5);
+    assertMatch(matches, 1, 1, 6, 14);
+
+    // A prefix expected to produce no intervals at all.
+    final IntervalsSource unmatchedSource = Intervals.prefix("qqq");
+    final int[][] noIntervals = {};
+    checkIntervals(unmatchedSource, "field1", 0, noIntervals);
+  }
+
+  /** Checks intervals and matches produced by {@link Intervals#wildcard(String)} on "field1". */
+  public void testWildcard() throws IOException {
+    final IntervalsSource globSource = Intervals.wildcard("?ot");
+    // One row per document; each row presumably lists start/end position pairs
+    // (confirm against checkIntervals).
+    final int[][] expectedIntervals = {
+        {},
+        { 2, 2, 10, 10, 17, 17, 27, 27 },
+        { 5, 5, 10, 10, 21, 21 },
+        { 3, 3 },
+        { 2, 2, 10, 10, 17, 17 },
+        {}
+    };
+    checkIntervals(globSource, "field1", 4, expectedIntervals);
+
+    // The three matches expected in document 4, in order.
+    final MatchesIterator matches = getMatches(globSource, 4, "field1");
+    assertNotNull(matches);
+    assertMatch(matches, 2, 2, 15, 18);
+    assertMatch(matches, 10, 10, 63, 66);
+    assertMatch(matches, 17, 17, 97, 100);
+  }
+
 }