LUCENE-7824: Fix graph query analysis for multi-word synonym rules with common terms (eg. new york, new york city).

Jim Ferenczi 2017-05-14 21:12:42 +02:00
parent e11bc03098
commit 21362a3ba4
3 changed files with 64 additions and 26 deletions
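
For context, the kind of token graph this fix targets can be reproduced by running overlapping multi-word synonym rules through SynonymGraphFilter and then enumerating the paths with the class patched below. The following sketch is not part of the commit: the synonym group, sample text, field name, and the use of SolrSynonymParser, WhitespaceAnalyzer and WhitespaceTokenizer are illustrative assumptions based on stock Lucene components, and the import path of GraphTokenStreamFiniteStrings reflects where the class lives in recent Lucene versions.

import java.io.IOException;
import java.io.StringReader;
import java.text.ParseException;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;

public class SynonymGraphDemo {
  public static void main(String[] args) throws IOException, ParseException {
    // One synonym group whose entries share the common terms "new" and "york".
    SolrSynonymParser parser = new SolrSynonymParser(true, true, new WhitespaceAnalyzer());
    parser.parse(new StringReader("ny, new york, new york city"));
    SynonymMap map = parser.build();

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        return new TokenStreamComponents(tokenizer, new SynonymGraphFilter(tokenizer, map, true));
      }
    };

    // Enumerate every path of the resulting token graph with the class changed in this commit.
    TokenStream ts = analyzer.tokenStream("body", "new york city is great");
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);
    Iterator<TokenStream> paths = graph.getFiniteStrings();
    while (paths.hasNext()) {
      try (TokenStream path = paths.next()) {
        CharTermAttribute term = path.addAttribute(CharTermAttribute.class);
        path.reset();
        StringBuilder sb = new StringBuilder();
        while (path.incrementToken()) {
          sb.append(term).append(' ');
        }
        path.end();
        System.out.println(sb);   // one line per path
      }
    }
    analyzer.close();
  }
}

With these rules the graph should contain three alternatives over the first positions ("ny", "new york", "new york city"), so the terms "new" and "york" appear on more than one path; that is the "common terms" case named in the commit message and covered by the new test below.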

CHANGES.txt

@@ -131,6 +131,9 @@ Bug Fixes
   "lucene"/standard query parser, should require " TO " in range queries,
   and accept "TO" as endpoints in range queries.  (hossman, Steve Rowe)
 
+* LUCENE-7824: Fix graph query analysis for multi-word synonym rules with common terms (eg. new york, new york city).
+  (Jim Ferenczi)
+
 Improvements
 
 * LUCENE-7782: OfflineSorter now passes the total number of items it

GraphTokenStreamFiniteStrings.java

@@ -48,7 +48,6 @@ import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZ
  * This class also provides helpers to explore the different paths of the {@link Automaton}.
  */
 public final class GraphTokenStreamFiniteStrings {
-  private final Map<BytesRef, Integer> termToID = new HashMap<>();
   private final Map<Integer, BytesRef> idToTerm = new HashMap<>();
   private final Map<Integer, Integer> idToInc = new HashMap<>();
   private final Automaton det;
@@ -247,36 +246,19 @@ public final class GraphTokenStreamFiniteStrings {
   }
 
   /**
-   * Gets an integer id for a given term.
-   *
-   * If there is no position gaps for this token then we can reuse the id for the same term if it appeared at another
-   * position without a gap. If we have a position gap generate a new id so we can keep track of the position
-   * increment.
+   * Gets an integer id for a given term and saves the position increment if needed.
    */
   private int getTermID(int incr, int prevIncr, BytesRef term) {
     assert term != null;
     boolean isStackedGap = incr == 0 && prevIncr > 1;
-    boolean hasGap = incr > 1;
-    Integer id;
-    if (hasGap || isStackedGap) {
-      id = idToTerm.size();
-      idToTerm.put(id, BytesRef.deepCopyOf(term));
-      // stacked token should have the same increment as original token at this position
-      if (isStackedGap) {
-        idToInc.put(id, prevIncr);
-      } else {
-        idToInc.put(id, incr);
-      }
-    } else {
-      id = termToID.get(term);
-      if (id == null) {
-        term = BytesRef.deepCopyOf(term);
-        id = idToTerm.size();
-        termToID.put(term, id);
-        idToTerm.put(id, term);
-      }
-    }
+    int id = idToTerm.size();
+    idToTerm.put(id, BytesRef.deepCopyOf(term));
+    // stacked token should have the same increment as original token at this position
+    if (isStackedGap) {
+      idToInc.put(id, prevIncr);
+    } else if (incr > 1) {
+      idToInc.put(id, incr);
+    }
     return id;
   }
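
The simplified getTermID now assigns a fresh id to every token occurrence, so a term that shows up in several overlapping rules (e.g. "new", "york") is no longer folded onto a single shared id. For readers unfamiliar with how the analyzed graph is consumed afterwards, the sketch below walks it the way a query builder typically does, using only the accessors that the new test below exercises; the class, method name and field are hypothetical and not part of the commit.

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;

public final class GraphWalkSketch {
  // Hypothetical helper: split the graph at its articulation points and handle each
  // interval either as a set of alternative paths or as a single position.
  static void walk(TokenStream source, String field) throws IOException {
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
    int[] points = graph.articulationPoints();   // states that every path passes through
    int state = 0;
    for (int i = 0; i <= points.length; i++) {
      int end = i < points.length ? points[i] : -1;   // -1 means "up to the final state"
      if (graph.hasSidePath(state)) {
        // Several alternatives between two articulation points,
        // e.g. {"ny"} vs {"new", "york"} vs {"new", "york", "city"}.
        Iterator<TokenStream> paths = graph.getFiniteStrings(state, end);
        while (paths.hasNext()) {
          try (TokenStream path = paths.next()) {
            // a real consumer would analyze each path into a phrase query here
          }
        }
      } else {
        // A single position: each term at this state maps to one query term.
        Term[] terms = graph.getTerms(field, state);
      }
      state = end;
    }
  }
}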

TestGraphTokenStreamFiniteStrings.java

@@ -378,6 +378,59 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase {
     assertArrayEquals(terms, new Term[] {new Term("field", "network")});
   }
 
+  public void testStackedGraphWithRepeat() throws Exception {
+    TokenStream ts = new CannedTokenStream(
+        token("ny", 1, 4),
+        token("new", 0, 1),
+        token("new", 0, 3),
+        token("york", 1, 1),
+        token("city", 1, 2),
+        token("york", 1, 1),
+        token("is", 1, 1),
+        token("great", 1, 1)
+    );
+
+    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);
+
+    Iterator<TokenStream> it = graph.getFiniteStrings();
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{"ny", "is", "great"}, new int[]{1, 1, 1});
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{"new", "york", "city", "is", "great"}, new int[]{1, 1, 1, 1, 1});
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{"new", "york", "is", "great"}, new int[]{1, 1, 1, 1});
+    assertFalse(it.hasNext());
+
+    int[] points = graph.articulationPoints();
+    assertArrayEquals(points, new int[] {4, 5});
+
+    assertTrue(graph.hasSidePath(0));
+    it = graph.getFiniteStrings(0, 4);
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{"ny"}, new int[]{1});
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{"new", "york", "city"}, new int[]{1, 1, 1});
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{"new", "york"}, new int[]{1, 1});
+    assertFalse(it.hasNext());
+
+    assertFalse(graph.hasSidePath(4));
+    it = graph.getFiniteStrings(4, 5);
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{"is"}, new int[] {1});
+    assertFalse(it.hasNext());
+    Term[] terms = graph.getTerms("field", 4);
+    assertArrayEquals(terms, new Term[] {new Term("field", "is")});
+
+    assertFalse(graph.hasSidePath(5));
+    it = graph.getFiniteStrings(5, -1);
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{"great"}, new int[] {1});
+    assertFalse(it.hasNext());
+    terms = graph.getTerms("field", 5);
+    assertArrayEquals(terms, new Term[] {new Term("field", "great")});
+  }
+
   public void testGraphWithRegularSynonym() throws Exception {
     TokenStream ts = new CannedTokenStream(
         token("fast", 1, 1),