Don't use index_phrases on graph queries (#44340)

Due to https://issues.apache.org/jira/browse/LUCENE-8916, when you try to use a synonym filter with the index_phrases option on a text field, you can end up with null values in a Phrase query, leading to weird exceptions further down the querying chain. As a workaround, this commit disables the index_phrases optimization for queries that produce token graphs. Fixes #43976
2025-03-25 01:19:02 +00:00 · 2019-07-17 16:08:28 +01:00 · 2019-07-17 16:08:28 +01:00 · b6a0f098e6
commit b6a0f098e6
parent ddd740162e
3 changed files with 63 additions and 3 deletions
--- a/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/60_synonym_graph.yml
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/60_synonym_graph.yml
@ -27,6 +27,9 @@ setup:
            properties:
              field:
                type: text
+              phrase_field:
+                type: text
+                index_phrases: true

  - do:
      index:
@ -204,3 +207,26 @@ setup:
  - match: { hits.hits.2._id: "1" }
  - match: { hits.hits.3._id: "8" }
  - match: { hits.hits.4._id: "2" }
+
+---
+"index_phrases":
+
+  - do:
+      index:
+        index: test
+        id: 9
+        body:
+          phrase_field: "bar baz"
+        refresh: true
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              phrase_field:
+                query: bar baz
+                analyzer: lower_graph_syns
+  - match: { hits.total: 1 }
+
--- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java
+++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java
@ -26,12 +26,15 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.shingle.FixedShingleFilter;
+import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.intervals.Intervals;
+import org.apache.lucene.queries.intervals.IntervalsSource;
 import org.apache.lucene.search.AutomatonQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
@ -44,8 +47,6 @@ import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SynonymQuery;
 import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.queries.intervals.Intervals;
-import org.apache.lucene.queries.intervals.IntervalsSource;
 import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
 import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
 import org.apache.lucene.search.spans.SpanNearQuery;
@ -688,7 +689,10 @@ public class TextFieldMapper extends FieldMapper {
        @Override
        public Query phraseQuery(TokenStream stream, int slop, boolean enablePosIncrements) throws IOException {
            String field = name();
-            if (indexPhrases && slop == 0 && hasGaps(stream) == false) {
+            // we can't use the index_phrases shortcut with slop, if there are gaps in the stream,
+            // or if the incoming token stream is the output of a token graph due to
+            // https://issues.apache.org/jira/browse/LUCENE-8916
+            if (indexPhrases && slop == 0 && hasGaps(stream) == false && stream.hasAttribute(BytesTermAttribute.class) == false) {
                stream = new FixedShingleFilter(stream, 2);
                field = field + FAST_PHRASE_SUFFIX;
            }
@ -701,6 +705,9 @@ public class TextFieldMapper extends FieldMapper {

            stream.reset();
            while (stream.incrementToken()) {
+                if (termAtt.getBytesRef() == null) {
+                    throw new IllegalStateException("Null term while building phrase query");
+                }
                if (enablePosIncrements) {
                    position += posIncrAtt.getPositionIncrement();
                }
--- a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java
+++ b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java
@ -19,7 +19,10 @@

 package org.elasticsearch.index.mapper;

+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockSynonymAnalyzer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.DocValuesType;
@ -30,6 +33,8 @@ import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.MultiPhraseQuery;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
@ -831,6 +836,28 @@ public class TextFieldMapperTests extends ESSingleNodeTestCase {
                new Term("synfield._index_phrase", "motor dog")})
            .build()));

+        // https://github.com/elastic/elasticsearch/issues/43976
+        CannedTokenStream cts = new CannedTokenStream(
+            new Token("foo", 1, 0, 2, 2),
+            new Token("bar", 0, 0, 2),
+            new Token("baz", 1, 0, 2)
+        );
+        Analyzer synonymAnalyzer = new Analyzer() {
+            @Override
+            protected TokenStreamComponents createComponents(String fieldName) {
+                return new TokenStreamComponents(reader -> {}, cts);
+            }
+        };
+        matchQuery.setAnalyzer(synonymAnalyzer);
+        Query q7 = matchQuery.parse(MatchQuery.Type.BOOLEAN, "synfield", "foo");
+        assertThat(q7, is(new BooleanQuery.Builder().add(new BooleanQuery.Builder()
+            .add(new TermQuery(new Term("synfield", "foo")), BooleanClause.Occur.SHOULD)
+            .add(new PhraseQuery.Builder()
+                .add(new Term("synfield", "bar"))
+                .add(new Term("synfield", "baz"))
+                .build(), BooleanClause.Occur.SHOULD)
+            .build(), BooleanClause.Occur.SHOULD).build()));
+
        ParsedDocument doc = mapper.parse(new SourceToParse("test", "type", "1", BytesReference
                .bytes(XContentFactory.jsonBuilder()
                    .startObject()