From 181ee3ae0b69dc2fc84d557c417a62ed09fdac92 Mon Sep 17 00:00:00 2001
From: Benjamin Trent <ben.w.trent@gmail.com>
Date: Thu, 5 Mar 2020 09:50:52 -0500
Subject: [PATCH] [ML] specifying missing_field_value value and using it
 instead of empty_string (#53108) (#53165)

For analytics, we need a consistent way of indicating when a value is missing. Inheriting from anomaly detection, analytics sent `""` when a field was missing. This works fine for numeric fields, but the underlying analytics process actually treats `""` as a category of its own for categorical fields.

Consequently, you end up with this situation in the resulting model:
```
{
              "frequency_encoding" : {
                "field" : "RainToday",
                "feature_name" : "RainToday_frequency",
                "frequency_map" : {
                  "" : 0.009844409027270245,
                  "No" : 0.6472019970785184,
                  "Yes" : 0.6472019970785184
                }
              }
            }
```
For inference this is a problem, because inference treats missing values as `null` and thus does not include them in the infer call against the model.

This PR takes advantage of our new `missing_field_value` option and supplies `\0` as the value.
---
 .../xpack/ml/dataframe/extractor/DataFrameDataExtractor.java | 4 ++--
 .../customprocessing/DatasetSplittingCustomProcessor.java    | 5 ++---
 .../ml/dataframe/extractor/DataFrameDataExtractorTests.java  | 3 ++-
 .../DatasetSplittingCustomProcessorTests.java                | 3 ++-
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java
index a2ce8cf60d4..6d9b6fb04b8 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java
@@ -52,7 +52,7 @@ public class DataFrameDataExtractor {
     private static final Logger LOGGER = LogManager.getLogger(DataFrameDataExtractor.class);
     private static final TimeValue SCROLL_TIMEOUT = new TimeValue(30, TimeUnit.MINUTES);
 
-    private static final String EMPTY_STRING = "";
+    public static final String NULL_VALUE = "\0";
 
     private final Client client;
     private final DataFrameDataExtractorContext context;
@@ -189,7 +189,7 @@ public class DataFrameDataExtractor {
             } else {
                 if (values.length == 0 && context.includeRowsWithMissingValues) {
                     // if values is empty then it means it's a missing value
-                    extractedValues[i] = EMPTY_STRING;
+                    extractedValues[i] = NULL_VALUE;
                 } else {
                     // we are here if we have a missing value but the analysis does not support those
                     // or the value type is not supported (e.g. arrays, etc.)
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessor.java
index bf6284aa7a5..6e6acfb271e 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessor.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessor.java
@@ -6,6 +6,7 @@
 package org.elasticsearch.xpack.ml.dataframe.process.customprocessing;
 
 import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
+import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractor;
 
 import java.util.List;
 import java.util.Random;
@@ -18,8 +19,6 @@ import java.util.Random;
  */
 class DatasetSplittingCustomProcessor implements CustomProcessor {
 
-    private static final String EMPTY = "";
-
     private final int dependentVariableIndex;
     private final double trainingPercent;
     private final Random random;
@@ -47,7 +46,7 @@ class DatasetSplittingCustomProcessor implements CustomProcessor {
                 // Let's make sure we have at least one training row
                 isFirstRow = false;
             } else if (isRandomlyExcludedFromTraining()) {
-                row[dependentVariableIndex] = EMPTY;
+                row[dependentVariableIndex] = DataFrameDataExtractor.NULL_VALUE;
             }
         }
     }
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractorTests.java
index 2fe90074d75..65e33da40f3 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractorTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractorTests.java
@@ -377,7 +377,8 @@ public class DataFrameDataExtractorTests extends ESTestCase {
         assertThat(rows.get().size(), equalTo(3));
 
         assertThat(rows.get().get(0).getValues(), equalTo(new String[] {"11", "21"}));
-        assertThat(rows.get().get(1).getValues(), equalTo(new String[] {"", "22"}));
+        assertThat(rows.get().get(1).getValues()[0], equalTo(DataFrameDataExtractor.NULL_VALUE));
+        assertThat(rows.get().get(1).getValues()[1], equalTo("22"));
         assertThat(rows.get().get(2).getValues(), equalTo(new String[] {"13", "23"}));
 
         assertThat(rows.get().get(0).shouldSkip(), is(false));
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessorTests.java
index d18adc3dcdb..ac897413a4e 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessorTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/process/customprocessing/DatasetSplittingCustomProcessorTests.java
@@ -6,6 +6,7 @@
 package org.elasticsearch.xpack.ml.dataframe.process.customprocessing;
 
 import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractor;
 import org.junit.Before;
 
 import java.util.ArrayList;
@@ -98,7 +99,7 @@ public class DatasetSplittingCustomProcessorTests extends ESTestCase {
                         assertThat(processedRow[fieldIndex], equalTo(row[fieldIndex]));
                     }
                 }
-                if (processedRow[dependentVariableIndex].length() > 0) {
+                if (DataFrameDataExtractor.NULL_VALUE.equals(processedRow[dependentVariableIndex]) == false) {
                     assertThat(processedRow[dependentVariableIndex], equalTo(row[dependentVariableIndex]));
                     trainingRows++;
                 }