[ML] fix edge case for data frame analytics where a field mapped as a keyword actually has boolean and string values in the _source (#64826) (#64862)

A field mapped as `keyword` can hold any scalar value type in `_source`: a number, a string, or a boolean.

This commit allows `boolean` values to be collected as categorical feature values when a `keyword` field contains them.
This commit is contained in:
Benjamin Trent 2020-11-10 08:46:52 -05:00 committed by GitHub
parent 09ff421d4f
commit dafafd7ec6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 39 additions and 3 deletions

View File

@ -360,7 +360,10 @@ public class DataFrameDataExtractor {
}
/**
 * Reports whether {@code value} is an instance of one of the accepted scalar types.
 *
 * NOTE(review): this hunk is rendered without +/- markers. The first return looks like the
 * pre-change line and the second return its replacement, which additionally accepts
 * Boolean - confirm against the raw diff.
 *
 * @param value the value to check
 * @return whether {@code value} is an instance of an accepted type (Number or String in the
 *         first return; Number, String or Boolean in the second)
 */
private static boolean isValidValue(Object value) {
return value instanceof Number || value instanceof String;
// We should allow a number, string or a boolean.
// It is possible for a field to be categorical and have a `keyword` mapping, but be any of these
// three types, in the same index.
return value instanceof Number || value instanceof String || value instanceof Boolean;
}
public static class DataSummary {

View File

@ -506,6 +506,39 @@ public class DataFrameDataExtractorTests extends ESTestCase {
assertThat(rows.get().get(2).shouldSkip(), is(false));
}
/**
 * Extracts a single batch in which "field_1" (declared with type "keyword") carries an
 * integer, a string and a boolean across three hits, and checks that each row comes back
 * with the string form of its values and is not flagged to be skipped.
 */
public void testExtractionWithMultipleScalarTypesInSource() throws IOException {
    DocValueField mixedTypeField = new DocValueField("field_1", Collections.singleton("keyword"));
    DocValueField numericField = new DocValueField("field_2", Collections.singleton("keyword"));
    extractedFields = new ExtractedFields(
        Arrays.asList(mixedTypeField, numericField),
        Collections.emptyList(),
        Collections.emptyMap());

    TestExtractor dataExtractor = createExtractor(true, true);

    // The one batch that carries data: field_1 mixes an int, a String and a boolean.
    List<Object> field1Values = Arrays.asList(1, "true", false);
    List<Object> field2Values = Arrays.asList(2_1, 2_2, 2_3);
    dataExtractor.setNextResponse(createSearchResponse(field1Values, field2Values));

    // A trailing empty response follows the data batch.
    dataExtractor.setNextResponse(createEmptySearchResponse());

    assertThat(dataExtractor.hasNext(), is(true));

    // Pull the data batch and inspect it row by row.
    Optional<List<DataFrameDataExtractor.Row>> rows = dataExtractor.next();
    assertThat(rows.isPresent(), is(true));

    List<DataFrameDataExtractor.Row> batch = rows.get();
    assertThat(batch.size(), equalTo(3));

    String[][] expectedValues = {
        {"1", "21"},
        {"true", "22"},
        {"false", "23"}
    };
    for (int rowIndex = 0; rowIndex < expectedValues.length; rowIndex++) {
        assertThat(batch.get(rowIndex).getValues(), equalTo(expectedValues[rowIndex]));
    }
    for (DataFrameDataExtractor.Row row : batch) {
        assertThat(row.shouldSkip(), is(false));
    }
}
private TestExtractor createExtractor(boolean includeSource, boolean supportsRowsWithMissingValues) {
DataFrameDataExtractorContext context = new DataFrameDataExtractorContext(JOB_ID, extractedFields, indices, query, scrollSize,
headers, includeSource, supportsRowsWithMissingValues, trainTestSplitterFactory);
@ -522,7 +555,7 @@ public class DataFrameDataExtractorTests extends ESTestCase {
true);
}
private SearchResponse createSearchResponse(List<Number> field1Values, List<Number> field2Values) {
private SearchResponse createSearchResponse(List<Object> field1Values, List<Object> field2Values) {
assertThat(field1Values.size(), equalTo(field2Values.size()));
SearchResponse searchResponse = mock(SearchResponse.class);
List<SearchHit> hits = new ArrayList<>();
@ -539,7 +572,7 @@ public class DataFrameDataExtractorTests extends ESTestCase {
return searchResponse;
}
/**
 * Adds {@code field} to the given hit builder: with an empty list of values when
 * {@code value} is null, otherwise with a single-element list holding {@code value}.
 *
 * NOTE(review): two signature lines appear because this hunk is rendered without +/-
 * markers. The Number-typed line looks like the pre-change signature and the Object-typed
 * line its replacement - confirm against the raw diff.
 */
private static void addField(SearchHitBuilder searchHitBuilder, String field, @Nullable Number value) {
private static void addField(SearchHitBuilder searchHitBuilder, String field, @Nullable Object value) {
searchHitBuilder.addField(field, value == null ? Collections.emptyList() : Collections.singletonList(value));
}