Allow word_delimiter_graph_filter to not adjust internal offsets (#36699)
This commit adds an adjust_offsets parameter to the word_delimiter_graph token filter, defaulting to true. Most of the time you want the sub-tokens emitted by this filter to have offsets adjusted to their real position in the token stream; however, some token filters can change the length or starting position of a token (e.g. trim) without changing its offset attributes, and this can lead to word_delimiter_graph emitting illegal offsets. Setting adjust_offsets to false in these cases allows indexing to succeed again. Fixes #34741, #33710
This commit is contained in:
parent 0ff1f1fa18
commit af57575838
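Before the diff, a minimal sketch (not part of this commit) of what the new flag changes in practice: it runs "PowerShot" through Lucene's WordDelimiterGraphFilter with internal-offset adjustment switched on and off. It assumes the five-argument WordDelimiterGraphFilter constructor that the factory change below calls is public in the Lucene version on your classpath; the class name AdjustOffsetsDemo and its main method are made up for illustration.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AdjustOffsetsDemo {

    public static void main(String[] args) throws IOException {
        // Offsets recomputed per sub-token: Power [0,5], Shot [5,9]
        printTokens(true);
        // Sub-tokens keep the parent token's offsets: Power [0,9], Shot [0,9]
        printTokens(false);
    }

    static void printTokens(boolean adjustInternalOffsets) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot"));
        // Split on the case change and also emit the catenated word,
        // mirroring the settings used in the new unit test below.
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                | WordDelimiterGraphFilter.CATENATE_WORDS;
        try (WordDelimiterGraphFilter filter = new WordDelimiterGraphFilter(
                tokenizer, adjustInternalOffsets,
                WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null)) {
            CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = filter.addAttribute(OffsetAttribute.class);
            filter.reset();
            while (filter.incrementToken()) {
                System.out.println(term.toString()
                        + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            filter.end();
        }
    }
}

With adjustment on, the catenated token PowerShot spans 0-9 while Power and Shot get 0-5 and 5-9; with adjustment off, all three keep the parent token's 0-9 offsets, matching the expectations added to the unit test further down.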
@@ -71,6 +71,15 @@ Advance settings include:
     to a file configured with protected words (one on each line).
     Automatically resolves to `config/` based location if exists.
 
+`adjust_offsets`::
+    By default, the filter tries to output subtokens with adjusted offsets
+    to reflect their actual position in the token stream. However, when
+    used in combination with other filters that alter the length or starting
+    position of tokens without changing their offsets
+    (e.g. <<analysis-trim-tokenfilter,`trim`>>) this can cause tokens with
+    illegal offsets to be emitted. Setting `adjust_offsets` to false will
+    stop `word_delimiter_graph` from adjusting these internal offsets.
+
 `type_table`::
     A custom type mapping table, for example (when configured
     using `type_table_path`):
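The documentation entry above singles out trim as a filter that changes a token's term without touching its offsets. As an illustration only, here is a hypothetical Lucene analyzer (the class name TrimThenSplitAnalyzer is made up; the Lucene classes themselves are real) that wires that combination together with the internal-offset adjustment turned off, which is what adjust_offsets: false on the Elasticsearch filter maps to in the factory change below.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;

public class TrimThenSplitAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // KeywordTokenizer emits "  PowerShot  " as a single token with offsets 0..13.
        Tokenizer source = new KeywordTokenizer();
        // TrimFilter shortens the term to "PowerShot" but leaves the offsets alone,
        // so the term text no longer lines up with the recorded offsets.
        TokenStream stream = new TrimFilter(source);
        // With the second argument set to false (the adjust_offsets=false case),
        // sub-tokens keep the parent token's offsets instead of being recomputed
        // from the now-stale term text.
        stream = new WordDelimiterGraphFilter(stream, false,
                WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
                WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                        | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE,
                null);
        return new TokenStreamComponents(source, stream);
    }
}

In index settings this corresponds to a custom analyzer whose filter chain runs trim before a word_delimiter_graph filter configured with adjust_offsets: false.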
@@ -55,6 +55,7 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
     private final byte[] charTypeTable;
     private final int flags;
     private final CharArraySet protoWords;
+    private final boolean adjustOffsets;
 
     public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
                                                 String name, Settings settings) {
@@ -95,11 +96,12 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
         Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
+        this.adjustOffsets = settings.getAsBoolean("adjust_offsets", true);
     }
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        return new WordDelimiterGraphFilter(tokenStream, true, charTypeTable, flags, protoWords);
+        return new WordDelimiterGraphFilter(tokenStream, adjustOffsets, charTypeTable, flags, protoWords);
     }
 
     @Override
@@ -76,10 +76,35 @@ public class WordDelimiterGraphTokenFilterFactoryTests
         String source = "PowerShot";
         int[] expectedIncr = new int[]{1, 0, 1};
         int[] expectedPosLen = new int[]{2, 1, 1};
+        int[] expectedStartOffsets = new int[]{0, 0, 5};
+        int[] expectedEndOffsets = new int[]{9, 5, 9};
         String[] expected = new String[]{"PowerShot", "Power", "Shot" };
         Tokenizer tokenizer = new WhitespaceTokenizer();
         tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
             expectedIncr, expectedPosLen, null);
     }
 
+    public void testAdjustingOffsets() throws IOException {
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_word_delimiter.type", type)
+                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+                .put("index.analysis.filter.my_word_delimiter.adjust_offsets", "false")
+                .build(),
+            new CommonAnalysisPlugin());
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
+        String source = "PowerShot";
+        int[] expectedIncr = new int[]{1, 0, 1};
+        int[] expectedPosLen = new int[]{2, 1, 1};
+        int[] expectedStartOffsets = new int[]{0, 0, 0};
+        int[] expectedEndOffsets = new int[]{9, 9, 9};
+        String[] expected = new String[]{"PowerShot", "Power", "Shot" };
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
+            expectedIncr, expectedPosLen, null);
+    }
 }
@@ -157,6 +157,26 @@
   - match: { tokens.2.token: brown }
   - match: { tokens.3.token: fox }
 
+  - do:
+      indices.analyze:
+        body:
+          text: the qu1ck brown fox
+          tokenizer: standard
+          filter:
+            - type: word_delimiter_graph
+              adjust_offsets: false
+  - length: { tokens: 6 }
+  - match: { tokens.0.token: the }
+  - match: { tokens.1.token: qu }
+  - match: { tokens.1.start_offset: 4 }
+  - match: { tokens.1.end_offset: 9 }
+  - match: { tokens.2.token: "1" }
+  - match: { tokens.2.start_offset: 4 }
+  - match: { tokens.2.end_offset: 9 }
+  - match: { tokens.3.token: ck }
+  - match: { tokens.3.start_offset: 4 }
+  - match: { tokens.3.end_offset: 9 }
+
   - do:
       indices.analyze:
        body: