From adc5509edad8d13b1f3e605171209ad40c09e3c8 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Mon, 26 Oct 2020 09:05:49 +0000 Subject: [PATCH] [ML] Support the unsigned_long type in data frame analytics (#64072) Adds support for the unsigned_long type to data frame analytics. This type is handled in the same way as the long type. Values sent to the ML native processes are converted to floats and hence will lose accuracy when outside the range where a float can uniquely represent long values. Backport of #64066 --- .../core/ml/dataframe/analyses/Types.java | 7 ++++--- .../DataFrameAnalysisCustomFeatureIT.java | 2 +- .../ExplainDataFrameAnalyticsIT.java | 4 ++-- .../xpack/ml/integration/RegressionIT.java | 2 +- .../integration/RunDataFrameAnalyticsIT.java | 4 ++-- .../ExtractedFieldsDetectorTests.java | 20 ++++++++++--------- .../test/ml/explain_data_frame_analytics.yml | 4 ++-- 7 files changed, 23 insertions(+), 20 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/dataframe/analyses/Types.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/dataframe/analyses/Types.java index d9a1343b5b5..f2cf6d06696 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/dataframe/analyses/Types.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/dataframe/analyses/Types.java @@ -30,13 +30,14 @@ public final class Types { private static final Set NUMERICAL_TYPES = Collections.unmodifiableSet( - Stream.concat(Stream.of(NumberType.values()).map(NumberType::typeName), Stream.of("scaled_float")) + Stream.concat(Stream.of(NumberType.values()).map(NumberType::typeName), Stream.of("scaled_float", "unsigned_long")) .collect(Collectors.toSet())); private static final Set DISCRETE_NUMERICAL_TYPES = Collections.unmodifiableSet( - Stream.of(NumberType.BYTE, NumberType.SHORT, NumberType.INTEGER, NumberType.LONG) - .map(NumberType::typeName) + Stream.concat( + Stream.of(NumberType.BYTE, NumberType.SHORT, NumberType.INTEGER, NumberType.LONG).map(NumberType::typeName), + Stream.of("unsigned_long")) .collect(Collectors.toSet())); private static final Set BOOL_TYPES = Collections.singleton(BooleanFieldMapper.CONTENT_TYPE); diff --git a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DataFrameAnalysisCustomFeatureIT.java b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DataFrameAnalysisCustomFeatureIT.java index fe29c2bf141..61ce2a3c65d 100644 --- a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DataFrameAnalysisCustomFeatureIT.java +++ b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DataFrameAnalysisCustomFeatureIT.java @@ -168,7 +168,7 @@ public class DataFrameAnalysisCustomFeatureIT extends MlNativeDataFrameAnalytics " \"type\": \"double\"\n" + " }," + " \""+ DISCRETE_NUMERICAL_FIELD + "\": {\n" + - " \"type\": \"integer\"\n" + + " \"type\": \"unsigned_long\"\n" + " }," + " \""+ TEXT_FIELD + "\": {\n" + " \"type\": \"text\"\n" + diff --git a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/ExplainDataFrameAnalyticsIT.java b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/ExplainDataFrameAnalyticsIT.java index 1e1e521652a..da86970941d 100644 --- a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/ExplainDataFrameAnalyticsIT.java +++ b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/ExplainDataFrameAnalyticsIT.java @@ -52,7 +52,7 @@ public class ExplainDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsInteg client().admin().indices().prepareCreate(sourceIndex) .addMapping("_doc", "numeric_1", "type=double", - "numeric_2", "type=float", + "numeric_2", "type=unsigned_long", "categorical", "type=keyword", "filtered_field", "type=keyword") .get(); @@ -64,7 +64,7 @@ public class ExplainDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsInteg IndexRequest indexRequest = new IndexRequest(sourceIndex); indexRequest.source( "numeric_1", 1.0, - "numeric_2", 2.0, + "numeric_2", 2, "categorical", i % 2 == 0 ? "class_1" : "class_2", "filtered_field", i < 2 ? "bingo" : "rest"); // We tag bingo on the first two docs to ensure we have 2 classes bulkRequestBuilder.add(indexRequest); diff --git a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/RegressionIT.java b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/RegressionIT.java index 318a2c75fc4..3ed7aff974f 100644 --- a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/RegressionIT.java +++ b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/RegressionIT.java @@ -675,7 +675,7 @@ public class RegressionIT extends MlNativeDataFrameAnalyticsIntegTestCase { " \"type\": \"double\"\n" + " }," + " \"" + DISCRETE_NUMERICAL_FEATURE_FIELD + "\": {\n" + - " \"type\": \"long\"\n" + + " \"type\": \"unsigned_long\"\n" + " }," + " \"" + DEPENDENT_VARIABLE_FIELD + "\": {\n" + " \"type\": \"double\"\n" + diff --git a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/RunDataFrameAnalyticsIT.java b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/RunDataFrameAnalyticsIT.java index 64a36c85161..eefcbfa2f88 100644 --- a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/RunDataFrameAnalyticsIT.java +++ b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/RunDataFrameAnalyticsIT.java @@ -72,7 +72,7 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest String sourceIndex = "test-outlier-detection-with-few-docs"; client().admin().indices().prepareCreate(sourceIndex) - .addMapping("_doc", "numeric_1", "type=double", "numeric_2", "type=float", "categorical_1", "type=keyword") + .addMapping("_doc", "numeric_1", "type=double", "numeric_2", "type=unsigned_long", "categorical_1", "type=keyword") .get(); BulkRequestBuilder bulkRequestBuilder = client().prepareBulk(); @@ -84,7 +84,7 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest // We insert one odd value out of 5 for one feature String docId = i == 0 ? "outlier" : "normal" + i; indexRequest.id(docId); - indexRequest.source("numeric_1", i == 0 ? 100.0 : 1.0, "numeric_2", 1.0, "categorical_1", "foo_" + i); + indexRequest.source("numeric_1", i == 0 ? 100.0 : 1.0, "numeric_2", 1, "categorical_1", "foo_" + i); bulkRequestBuilder.add(indexRequest); } BulkResponse bulkResponse = bulkRequestBuilder.get(); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java index 744452439ac..adb006f2fa6 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java @@ -105,7 +105,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase { assertThat(fieldExtraction.v2().get(0).getName(), equalTo("some_keyword")); assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false)); assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " + - "[boolean, byte, double, float, half_float, integer, long, scaled_float, short]")); + "[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]")); } public void testDetect_GivenOutlierDetectionAndFieldWithNumericAndNonNumericTypes() { @@ -121,7 +121,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase { assertThat(fieldExtraction.v2().get(0).getName(), equalTo("indecisive_field")); assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false)); assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " + - "[boolean, byte, double, float, half_float, integer, long, scaled_float, short]")); + "[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]")); } public void testDetect_GivenOutlierDetectionAndMultipleFields() { @@ -147,7 +147,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase { FieldSelection.included("some_boolean", Collections.singleton("boolean"), false, FieldSelection.FeatureType.NUMERICAL), FieldSelection.included("some_float", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL), FieldSelection.excluded("some_keyword", Collections.singleton("keyword"), "unsupported type; " + - "supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]"), + "supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"), FieldSelection.included("some_long", Collections.singleton("long"), false, FieldSelection.FeatureType.NUMERICAL) ); } @@ -282,7 +282,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase { ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); assertThat(e.getMessage(), equalTo("invalid types [keyword] for required field [foo]; " + - "expected types are [byte, double, float, half_float, integer, long, scaled_float, short]")); + "expected types are [byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]")); } public void testDetect_GivenClassificationAndRequiredFieldHasInvalidType() { @@ -298,7 +298,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase { ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); assertThat(e.getMessage(), equalTo("invalid types [float] for required field [some_float]; " + - "expected types are [boolean, byte, integer, ip, keyword, long, short, text]")); + "expected types are [boolean, byte, integer, ip, keyword, long, short, text, unsigned_long]")); } public void testDetect_GivenClassificationAndDependentVariableHasInvalidCardinality() { @@ -371,7 +371,8 @@ public class ExtractedFieldsDetectorTests extends ESTestCase { assertFieldSelectionContains(fieldExtraction.v2(), FieldSelection.excluded("categorical", Collections.singleton("keyword"), - "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]"), + "unsupported type; supported types are " + + "[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"), FieldSelection.included("numeric", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL) ); } @@ -471,7 +472,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase { ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); assertThat(e.getMessage(), equalTo("field [your_keyword] has unsupported type [keyword]. " + - "Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short].")); + "Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long].")); } public void testDetect_GivenNotIncludedFieldHasUnsupportedType() { @@ -492,7 +493,8 @@ public class ExtractedFieldsDetectorTests extends ESTestCase { assertFieldSelectionContains(fieldExtraction.v2(), FieldSelection.excluded("categorical", Collections.singleton("keyword"), - "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]"), + "unsupported type; supported types are " + + "[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"), FieldSelection.included("numeric", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL) ); } @@ -517,7 +519,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase { FieldSelection.included("my_field1", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL), FieldSelection.included("your_field2", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL), FieldSelection.excluded("your_keyword", Collections.singleton("keyword"), "unsupported type; supported types " + - "are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]") + "are [boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]") ); } diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/explain_data_frame_analytics.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/explain_data_frame_analytics.yml index 1a7a51dff3d..165ec26d4a4 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/explain_data_frame_analytics.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/explain_data_frame_analytics.yml @@ -225,7 +225,7 @@ - match: { field_selection.2.is_included: false } - match: { field_selection.2.is_required: false } - is_false: field_selection.2.feature_type - - match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text]" } + - match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text, unsigned_long]" } - match: { field_selection.3.name: "field_4" } - match: { field_selection.3.mapping_types: ["text"] } - match: { field_selection.3.is_included: false } @@ -299,7 +299,7 @@ - match: { field_selection.2.is_included: false } - match: { field_selection.2.is_required: false } - is_false: field_selection.2.feature_type - - match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text]" } + - match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text, unsigned_long]" } - match: { field_selection.3.name: "field_4" } - match: { field_selection.3.mapping_types: ["text"] } - match: { field_selection.3.is_included: false }