[ML] Support the unsigned_long type in data frame analytics (#64072)

Adds support for the unsigned_long type to data frame analytics.

This type is handled in the same way as the long type. Values
sent to the ML native processes are converted to floats, so
values outside the range in which a float can represent every
integer exactly will lose precision.

Backport of #64066
This commit is contained in:
David Roberts 2020-10-26 09:05:49 +00:00 committed by GitHub
parent 96407268a0
commit adc5509eda
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 23 additions and 20 deletions

View File

@ -30,13 +30,14 @@ public final class Types {
private static final Set<String> NUMERICAL_TYPES =
Collections.unmodifiableSet(
Stream.concat(Stream.of(NumberType.values()).map(NumberType::typeName), Stream.of("scaled_float"))
Stream.concat(Stream.of(NumberType.values()).map(NumberType::typeName), Stream.of("scaled_float", "unsigned_long"))
.collect(Collectors.toSet()));
private static final Set<String> DISCRETE_NUMERICAL_TYPES =
Collections.unmodifiableSet(
Stream.of(NumberType.BYTE, NumberType.SHORT, NumberType.INTEGER, NumberType.LONG)
.map(NumberType::typeName)
Stream.concat(
Stream.of(NumberType.BYTE, NumberType.SHORT, NumberType.INTEGER, NumberType.LONG).map(NumberType::typeName),
Stream.of("unsigned_long"))
.collect(Collectors.toSet()));
private static final Set<String> BOOL_TYPES = Collections.singleton(BooleanFieldMapper.CONTENT_TYPE);

View File

@ -168,7 +168,7 @@ public class DataFrameAnalysisCustomFeatureIT extends MlNativeDataFrameAnalytics
" \"type\": \"double\"\n" +
" }," +
" \""+ DISCRETE_NUMERICAL_FIELD + "\": {\n" +
" \"type\": \"integer\"\n" +
" \"type\": \"unsigned_long\"\n" +
" }," +
" \""+ TEXT_FIELD + "\": {\n" +
" \"type\": \"text\"\n" +

View File

@ -52,7 +52,7 @@ public class ExplainDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsInteg
client().admin().indices().prepareCreate(sourceIndex)
.addMapping("_doc",
"numeric_1", "type=double",
"numeric_2", "type=float",
"numeric_2", "type=unsigned_long",
"categorical", "type=keyword",
"filtered_field", "type=keyword")
.get();
@ -64,7 +64,7 @@ public class ExplainDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsInteg
IndexRequest indexRequest = new IndexRequest(sourceIndex);
indexRequest.source(
"numeric_1", 1.0,
"numeric_2", 2.0,
"numeric_2", 2,
"categorical", i % 2 == 0 ? "class_1" : "class_2",
"filtered_field", i < 2 ? "bingo" : "rest"); // We tag bingo on the first two docs to ensure we have 2 classes
bulkRequestBuilder.add(indexRequest);

View File

@ -675,7 +675,7 @@ public class RegressionIT extends MlNativeDataFrameAnalyticsIntegTestCase {
" \"type\": \"double\"\n" +
" }," +
" \"" + DISCRETE_NUMERICAL_FEATURE_FIELD + "\": {\n" +
" \"type\": \"long\"\n" +
" \"type\": \"unsigned_long\"\n" +
" }," +
" \"" + DEPENDENT_VARIABLE_FIELD + "\": {\n" +
" \"type\": \"double\"\n" +

View File

@ -72,7 +72,7 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
String sourceIndex = "test-outlier-detection-with-few-docs";
client().admin().indices().prepareCreate(sourceIndex)
.addMapping("_doc", "numeric_1", "type=double", "numeric_2", "type=float", "categorical_1", "type=keyword")
.addMapping("_doc", "numeric_1", "type=double", "numeric_2", "type=unsigned_long", "categorical_1", "type=keyword")
.get();
BulkRequestBuilder bulkRequestBuilder = client().prepareBulk();
@ -84,7 +84,7 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
// We insert one odd value out of 5 for one feature
String docId = i == 0 ? "outlier" : "normal" + i;
indexRequest.id(docId);
indexRequest.source("numeric_1", i == 0 ? 100.0 : 1.0, "numeric_2", 1.0, "categorical_1", "foo_" + i);
indexRequest.source("numeric_1", i == 0 ? 100.0 : 1.0, "numeric_2", 1, "categorical_1", "foo_" + i);
bulkRequestBuilder.add(indexRequest);
}
BulkResponse bulkResponse = bulkRequestBuilder.get();

View File

@ -105,7 +105,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
assertThat(fieldExtraction.v2().get(0).getName(), equalTo("some_keyword"));
assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false));
assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " +
"[boolean, byte, double, float, half_float, integer, long, scaled_float, short]"));
"[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"));
}
public void testDetect_GivenOutlierDetectionAndFieldWithNumericAndNonNumericTypes() {
@ -121,7 +121,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
assertThat(fieldExtraction.v2().get(0).getName(), equalTo("indecisive_field"));
assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false));
assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " +
"[boolean, byte, double, float, half_float, integer, long, scaled_float, short]"));
"[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"));
}
public void testDetect_GivenOutlierDetectionAndMultipleFields() {
@ -147,7 +147,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
FieldSelection.included("some_boolean", Collections.singleton("boolean"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.included("some_float", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.excluded("some_keyword", Collections.singleton("keyword"), "unsupported type; " +
"supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]"),
"supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"),
FieldSelection.included("some_long", Collections.singleton("long"), false, FieldSelection.FeatureType.NUMERICAL)
);
}
@ -282,7 +282,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
assertThat(e.getMessage(), equalTo("invalid types [keyword] for required field [foo]; " +
"expected types are [byte, double, float, half_float, integer, long, scaled_float, short]"));
"expected types are [byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"));
}
public void testDetect_GivenClassificationAndRequiredFieldHasInvalidType() {
@ -298,7 +298,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
assertThat(e.getMessage(), equalTo("invalid types [float] for required field [some_float]; " +
"expected types are [boolean, byte, integer, ip, keyword, long, short, text]"));
"expected types are [boolean, byte, integer, ip, keyword, long, short, text, unsigned_long]"));
}
public void testDetect_GivenClassificationAndDependentVariableHasInvalidCardinality() {
@ -371,7 +371,8 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.excluded("categorical", Collections.singleton("keyword"),
"unsupported type; supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]"),
"unsupported type; supported types are " +
"[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"),
FieldSelection.included("numeric", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL)
);
}
@ -471,7 +472,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
assertThat(e.getMessage(), equalTo("field [your_keyword] has unsupported type [keyword]. " +
"Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]."));
"Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]."));
}
public void testDetect_GivenNotIncludedFieldHasUnsupportedType() {
@ -492,7 +493,8 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.excluded("categorical", Collections.singleton("keyword"),
"unsupported type; supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]"),
"unsupported type; supported types are " +
"[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"),
FieldSelection.included("numeric", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL)
);
}
@ -517,7 +519,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
FieldSelection.included("my_field1", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.included("your_field2", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.excluded("your_keyword", Collections.singleton("keyword"), "unsupported type; supported types " +
"are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]")
"are [boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]")
);
}

View File

@ -225,7 +225,7 @@
- match: { field_selection.2.is_included: false }
- match: { field_selection.2.is_required: false }
- is_false: field_selection.2.feature_type
- match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text]" }
- match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text, unsigned_long]" }
- match: { field_selection.3.name: "field_4" }
- match: { field_selection.3.mapping_types: ["text"] }
- match: { field_selection.3.is_included: false }
@ -299,7 +299,7 @@
- match: { field_selection.2.is_included: false }
- match: { field_selection.2.is_required: false }
- is_false: field_selection.2.feature_type
- match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text]" }
- match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text, unsigned_long]" }
- match: { field_selection.3.name: "field_4" }
- match: { field_selection.3.mapping_types: ["text"] }
- match: { field_selection.3.is_included: false }