From d6ca53cd2691017c30789d82ceb29628be6858cb Mon Sep 17 00:00:00 2001 From: David Kyle Date: Thu, 8 Feb 2018 16:18:48 +0000 Subject: [PATCH] [ML] Fix exception categorising an empty string (elastic/x-pack-elasticsearch#3870) Original commit: elastic/x-pack-elasticsearch@1840a74415f8cc7ec0c1a27017cab023d2f81612 --- .../ml/job/categorization/MlClassicTokenizer.java | 2 +- .../categorization/CategorizationAnalyzerTests.java | 7 +++++++ .../job/categorization/MlClassicTokenizerTests.java | 12 ++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/MlClassicTokenizer.java b/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/MlClassicTokenizer.java index cb1e36d4f46..72fd2242105 100644 --- a/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/MlClassicTokenizer.java +++ b/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/MlClassicTokenizer.java @@ -105,7 +105,7 @@ public class MlClassicTokenizer extends Tokenizer { public final void end() throws IOException { super.end(); // Set final offset - int finalOffset = nextOffset + (int) input.skip(Integer.MAX_VALUE) - 1; + int finalOffset = Math.max(0, nextOffset + (int) input.skip(Integer.MAX_VALUE) - 1); offsetAtt.setOffset(finalOffset, finalOffset); // Adjust any skipped tokens posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); diff --git a/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java b/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java index 418b4439262..a6c5a6d0bd5 100644 --- a/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java +++ b/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzerTests.java @@ -172,6 +172,13 @@ public class CategorizationAnalyzerTests extends ESTestCase { } } + public void testEmptyString() throws IOException { + CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null); + try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, defaultConfig)) { + assertEquals(Collections.emptyList(), categorizationAnalyzer.tokenizeField("foo", "")); + } + } + public void testThaiAnalyzer() throws IOException { CategorizationAnalyzerConfig config = new CategorizationAnalyzerConfig.Builder().setAnalyzer("thai").build(); try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, config)) { diff --git a/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/MlClassicTokenizerTests.java b/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/MlClassicTokenizerTests.java index a525cbd6b3d..8aba5642255 100644 --- a/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/MlClassicTokenizerTests.java +++ b/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/MlClassicTokenizerTests.java @@ -45,4 +45,16 @@ public class MlClassicTokenizerTests extends ESTestCase { tokenizer.end(); } } + + public void testTokenize_emptyString() throws IOException { + String testData = ""; + try (Tokenizer tokenizer = new MlClassicTokenizer()) { + tokenizer.setReader(new StringReader(testData)); + tokenizer.reset(); + CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class); + assertFalse(tokenizer.incrementToken()); + assertEquals("", term.toString()); + tokenizer.end(); + } + } }