Add stopword support to IntervalBuilder (#39637)

The match interval builder analyses input text and converts it to an IntervalsSource, and as such may encounter token streams containing position gaps where stopwords have been removed. This commit handles these gaps by using the Intervals.extend factory to widen the interval of the term that follows each gap so that it also covers the positions of the removed stopwords, which allows phrase and ordered queries to work correctly.
2019-03-05 10:31:44 +00:00 · 2019-03-05 10:31:44 +00:00 · 0b14782b23
parent e7dbfda5d3
commit 0b14782b23
2 changed files with 101 additions and 11 deletions
--- a/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java
+++ b/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java
@ -143,38 +143,50 @@ public class IntervalBuilder {
    protected List<IntervalsSource> analyzeTerms(TokenStream ts) throws IOException {
        List<IntervalsSource> terms = new ArrayList<>();
        TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+        PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            BytesRef term = bytesAtt.getBytesRef();
-            terms.add(Intervals.term(BytesRef.deepCopyOf(term)));
+            int precedingSpaces = posAtt.getPositionIncrement() - 1;
+            terms.add(extend(Intervals.term(BytesRef.deepCopyOf(term)), precedingSpaces));
        }
        ts.end();
        return terms;
    }

+    public static IntervalsSource extend(IntervalsSource source, int precedingSpaces) {
+        if (precedingSpaces == 0) {
+            return source;
+        }
+        return Intervals.extend(source, precedingSpaces, 0);
+    }
+
    protected IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean ordered) throws IOException {
        List<IntervalsSource> terms = new ArrayList<>();
        List<IntervalsSource> synonyms = new ArrayList<>();
        TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
+        int spaces = 0;
        while (ts.incrementToken()) {
-            if (posAtt.getPositionIncrement() == 1) {
+            int posInc = posAtt.getPositionIncrement();
+            if (posInc > 0) {
                if (synonyms.size() == 1) {
-                    terms.add(synonyms.get(0));
+                    terms.add(extend(synonyms.get(0), spaces));
                }
                else if (synonyms.size() > 1) {
-                    terms.add(Intervals.or(synonyms.toArray(new IntervalsSource[0])));
+                    terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
                }
                synonyms.clear();
+                spaces = posInc - 1;
            }
            synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef())));
        }
        if (synonyms.size() == 1) {
-            terms.add(synonyms.get(0));
+            terms.add(extend(synonyms.get(0), spaces));
        }
        else {
-            terms.add(Intervals.or(synonyms.toArray(new IntervalsSource[0])));
+            terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
        }
        return combineSources(terms, maxGaps, ordered);
    }
--- a/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java
+++ b/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java
@ -94,6 +94,22 @@ public class IntervalBuilderTests extends ESTestCase {

    }

+    public void testPhraseWithStopword() throws IOException {
+
+        CannedTokenStream ts = new CannedTokenStream(
+            new Token("term1", 1, 1, 2),
+            new Token("term3", 2, 5, 6)
+        );
+
+        IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), 0, true);
+        IntervalsSource expected = Intervals.phrase(
+            Intervals.term("term1"), Intervals.extend(Intervals.term("term3"), 1, 0)
+        );
+
+        assertEquals(expected, source);
+
+    }
+
    public void testSimpleSynonyms() throws IOException {

        CannedTokenStream ts = new CannedTokenStream(
@ -112,16 +128,32 @@ public class IntervalBuilderTests extends ESTestCase {

    }

+    public void testSimpleSynonymsWithGap() throws IOException {
+        // term1 [] term2/term3/term4 term5
+        CannedTokenStream ts = new CannedTokenStream(
+            new Token("term1", 1, 2),
+            new Token("term2", 2, 3, 4),
+            new Token("term3", 0, 3, 4),
+            new Token("term4", 0, 3, 4),
+            new Token("term5", 5, 6)
+        );
+
+        IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true);
+        IntervalsSource expected = Intervals.ordered(
+            Intervals.term("term1"),
+            Intervals.extend(Intervals.or(Intervals.term("term2"), Intervals.term("term3"), Intervals.term("term4")), 1, 0),
+            Intervals.term("term5")
+        );
+        assertEquals(expected, source);
+    }
+
    public void testGraphSynonyms() throws IOException {

-        // term1 term2/term3:2 term4 term5
-
-        Token graphToken = new Token("term2", 3, 4);
-        graphToken.setPositionLength(2);
+        // term1 term2:2/term3 term4 term5

        CannedTokenStream ts = new CannedTokenStream(
            new Token("term1", 1, 2),
-            graphToken,
+            new Token("term2", 1, 3, 4, 2),
            new Token("term3", 0, 3, 4),
            new Token("term4", 5, 6),
            new Token("term5", 6, 7)
@ -138,4 +170,50 @@ public class IntervalBuilderTests extends ESTestCase {

    }

+    public void testGraphSynonymsWithGaps() throws IOException {
+
+        // term1 [] term2:4/term3 [] [] term4 term5
+
+        CannedTokenStream ts = new CannedTokenStream(
+            new Token("term1", 1, 2),
+            new Token("term2", 2, 3, 4, 4),
+            new Token("term3", 0, 3, 4),
+            new Token("term4", 3, 5, 6),
+            new Token("term5", 6, 7)
+        );
+
+        IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true);
+        IntervalsSource expected = Intervals.ordered(
+            Intervals.term("term1"),
+            Intervals.or(
+                Intervals.extend(Intervals.term("term2"), 1, 0),
+                Intervals.phrase(
+                    Intervals.extend(Intervals.term("term3"), 1, 0),
+                    Intervals.extend(Intervals.term("term4"), 2, 0))),
+            Intervals.term("term5")
+        );
+
+        assertEquals(expected, source);
+
+    }
+
+    public void testGraphTerminatesOnGap() throws IOException {
+        // term1 term2:2/term3 term4 [] term5
+        CannedTokenStream ts = new CannedTokenStream(
+            new Token("term1", 1, 2),
+            new Token("term2", 1, 2, 3, 2),
+            new Token("term3", 0, 2, 3),
+            new Token("term4", 2, 3),
+            new Token("term5", 2, 6, 7)
+        );
+
+        IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true);
+        IntervalsSource expected = Intervals.ordered(
+            Intervals.term("term1"),
+            Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")),
+            Intervals.extend(Intervals.term("term5"), 1, 0)
+        );
+        assertEquals(expected, source);
+    }
+
 }