[ML] fix edge case for data frame analytics where a field mapped as a keyword actually has boolean and string values in the _source (#64826) (#64862)
A field mapped as `keyword` can hold values of any scalar type in the `_source`: any numerical value, a String, or a boolean. This commit allows `boolean` values to be treated as part of the categorical feature collection when this is the case.
This commit is contained in:
parent
09ff421d4f
commit
dafafd7ec6
|
@ -360,7 +360,10 @@ public class DataFrameDataExtractor {
|
|||
}
|
||||
|
||||
private static boolean isValidValue(Object value) {
|
||||
return value instanceof Number || value instanceof String;
|
||||
// We should allow a number, string or a boolean.
|
||||
// It is possible for a field to be categorical and have a `keyword` mapping, but be any of these
|
||||
// three types, in the same index.
|
||||
return value instanceof Number || value instanceof String || value instanceof Boolean;
|
||||
}
|
||||
|
||||
public static class DataSummary {
|
||||
|
|
|
@ -506,6 +506,39 @@ public class DataFrameDataExtractorTests extends ESTestCase {
|
|||
assertThat(rows.get().get(2).shouldSkip(), is(false));
|
||||
}
|
||||
|
||||
public void testExtractionWithMultipleScalarTypesInSource() throws IOException {
|
||||
extractedFields = new ExtractedFields(Arrays.asList(
|
||||
new DocValueField("field_1", Collections.singleton("keyword")),
|
||||
new DocValueField("field_2", Collections.singleton("keyword"))),
|
||||
Collections.emptyList(),
|
||||
Collections.emptyMap());
|
||||
|
||||
TestExtractor dataExtractor = createExtractor(true, true);
|
||||
|
||||
// First and only batch
|
||||
SearchResponse response1 = createSearchResponse(Arrays.asList(1, "true", false), Arrays.asList(2_1, 2_2, 2_3));
|
||||
dataExtractor.setNextResponse(response1);
|
||||
|
||||
// Empty
|
||||
SearchResponse lastAndEmptyResponse = createEmptySearchResponse();
|
||||
dataExtractor.setNextResponse(lastAndEmptyResponse);
|
||||
|
||||
assertThat(dataExtractor.hasNext(), is(true));
|
||||
|
||||
// First batch
|
||||
Optional<List<DataFrameDataExtractor.Row>> rows = dataExtractor.next();
|
||||
assertThat(rows.isPresent(), is(true));
|
||||
assertThat(rows.get().size(), equalTo(3));
|
||||
|
||||
assertThat(rows.get().get(0).getValues(), equalTo(new String[] {"1", "21",}));
|
||||
assertThat(rows.get().get(1).getValues(), equalTo(new String[] {"true", "22"}));
|
||||
assertThat(rows.get().get(2).getValues(), equalTo(new String[] {"false", "23"}));
|
||||
|
||||
assertThat(rows.get().get(0).shouldSkip(), is(false));
|
||||
assertThat(rows.get().get(1).shouldSkip(), is(false));
|
||||
assertThat(rows.get().get(2).shouldSkip(), is(false));
|
||||
}
|
||||
|
||||
private TestExtractor createExtractor(boolean includeSource, boolean supportsRowsWithMissingValues) {
|
||||
DataFrameDataExtractorContext context = new DataFrameDataExtractorContext(JOB_ID, extractedFields, indices, query, scrollSize,
|
||||
headers, includeSource, supportsRowsWithMissingValues, trainTestSplitterFactory);
|
||||
|
@ -522,7 +555,7 @@ public class DataFrameDataExtractorTests extends ESTestCase {
|
|||
true);
|
||||
}
|
||||
|
||||
private SearchResponse createSearchResponse(List<Number> field1Values, List<Number> field2Values) {
|
||||
private SearchResponse createSearchResponse(List<Object> field1Values, List<Object> field2Values) {
|
||||
assertThat(field1Values.size(), equalTo(field2Values.size()));
|
||||
SearchResponse searchResponse = mock(SearchResponse.class);
|
||||
List<SearchHit> hits = new ArrayList<>();
|
||||
|
@ -539,7 +572,7 @@ public class DataFrameDataExtractorTests extends ESTestCase {
|
|||
return searchResponse;
|
||||
}
|
||||
|
||||
private static void addField(SearchHitBuilder searchHitBuilder, String field, @Nullable Number value) {
|
||||
private static void addField(SearchHitBuilder searchHitBuilder, String field, @Nullable Object value) {
|
||||
searchHitBuilder.addField(field, value == null ? Collections.emptyList() : Collections.singletonList(value));
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue