Field mapping detection is done via grok patterns. This commit adds detection of geometries formatted as well-known text (WKT). If every value is a `POINT`, then a `geo_point` mapping is preferred. Otherwise, if all the field values are WKT geometries, a `geo_shape` mapping is preferred. This does **NOT** detect other types of formatted geometries (geohash, comma-delimited points, etc.). Closes https://github.com/elastic/elasticsearch/issues/56967
This commit is contained in:
parent
9af31109fa
commit
f00dfb2d5f
|
@ -15,6 +15,7 @@ import java.util.ArrayList;
|
|||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -35,11 +36,35 @@ public final class FileStructureUtils {
|
|||
public static final Set<String> CONVERTIBLE_TYPES =
|
||||
Collections.unmodifiableSet(Sets.newHashSet("integer", "long", "float", "double", "boolean"));
|
||||
|
||||
/**
 * Grok pattern definitions used for well-known text (WKT) geometry detection: the WKT
 * patterns declared below plus everything returned by {@code Grok.getBuiltinPatterns()}.
 * The map is unmodifiable once the static initializer has run.
 */
private static final Map<String, String> EXTENDED_PATTERNS;
static {
    Map<String, String> patterns = new HashMap<>();
    // Two numbers separated by a single space, e.g. "-77.03653 38.897676"
    patterns.put("GEO_POINT", "%{NUMBER} %{NUMBER}");
    // Parenthesised, ", "-separated list of points.
    // NOTE(review): this requires at least two points, so a single-point MULTIPOINT is not matched.
    patterns.put("GEO_POINT_GROUP", "\\(%{GEO_POINT}, (?:%{GEO_POINT}, )*%{GEO_POINT}\\)");
    // Parenthesised, ", "-separated list of one or more point groups
    patterns.put("GEO_POINT_GROUP_GROUP", "\\(%{GEO_POINT_GROUP}(?:, %{GEO_POINT_GROUP})*\\)");
    patterns.put("WKT_POINT", "POINT \\(%{GEO_POINT}\\)");
    patterns.put("WKT_LINESTRING", "LINESTRING %{GEO_POINT_GROUP}");
    patterns.put("WKT_MULTIPOINT", "MULTIPOINT %{GEO_POINT_GROUP}");
    patterns.put("WKT_POLYGON", "POLYGON %{GEO_POINT_GROUP_GROUP}");
    patterns.put("WKT_MULTILINESTRING", "MULTILINESTRING %{GEO_POINT_GROUP_GROUP}");
    patterns.put("WKT_MULTIPOLYGON", "MULTIPOLYGON \\(%{GEO_POINT_GROUP_GROUP}(?:, %{GEO_POINT_GROUP_GROUP})*\\)");
    patterns.put("WKT_BBOX", "BBOX \\(%{NUMBER}, %{NUMBER}, %{NUMBER}, %{NUMBER}\\)");
    // Any single (non-collection) geometry
    patterns.put(
        "WKT_ANY",
        "(?:%{WKT_POINT}|%{WKT_LINESTRING}|%{WKT_MULTIPOINT}|%{WKT_POLYGON}|%{WKT_MULTILINESTRING}|%{WKT_MULTIPOLYGON}|%{WKT_BBOX})"
    );
    // One or more geometries. The trailing group needs the '*' quantifier: without it only
    // collections containing exactly two geometries would match.
    patterns.put("WKT_GEOMETRYCOLLECTION", "GEOMETRYCOLLECTION \\(%{WKT_ANY}(?:, %{WKT_ANY})*\\)");
    // Built-in patterns are added last, so a built-in definition replaces any same-named entry above
    patterns.putAll(Grok.getBuiltinPatterns());
    EXTENDED_PATTERNS = Collections.unmodifiableMap(patterns);
}
|
||||
|
||||
private static final int NUM_TOP_HITS = 10;
|
||||
// NUMBER Grok pattern doesn't support scientific notation, so we extend it
|
||||
private static final Grok NUMBER_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{NUMBER}(?:[eE][+-]?[0-3]?[0-9]{1,2})?$",
|
||||
TimeoutChecker.watchdog);
|
||||
private static final Grok IP_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{IP}$", TimeoutChecker.watchdog);
|
||||
private static final Grok GEO_POINT_WKT = new Grok(EXTENDED_PATTERNS, "^%{WKT_POINT}$", TimeoutChecker.watchdog);
|
||||
private static final Grok GEO_WKT = new Grok(EXTENDED_PATTERNS, "^(?:%{WKT_ANY}|%{WKT_GEOMETRYCOLLECTION})$", TimeoutChecker.watchdog);
|
||||
private static final int KEYWORD_MAX_LEN = 256;
|
||||
private static final int KEYWORD_MAX_SPACES = 5;
|
||||
|
||||
|
@ -317,6 +342,11 @@ public final class FileStructureUtils {
|
|||
}
|
||||
else if (fieldValues.stream().allMatch(IP_GROK::match)) {
|
||||
return Collections.singletonMap(MAPPING_TYPE_SETTING, "ip");
|
||||
// geo_point mapping MUST be checked before geo_shape as geo_shape also contains a matcher for geo_point
|
||||
} else if (fieldValues.stream().allMatch(GEO_POINT_WKT::match)) {
|
||||
return Collections.singletonMap(MAPPING_TYPE_SETTING, "geo_point");
|
||||
} else if (fieldValues.stream().allMatch(GEO_WKT::match)) {
|
||||
return Collections.singletonMap(MAPPING_TYPE_SETTING, "geo_shape");
|
||||
}
|
||||
|
||||
if (fieldValues.stream().anyMatch(FileStructureUtils::isMoreLikelyTextThanKeyword)) {
|
||||
|
|
|
@ -17,6 +17,7 @@ import java.util.Map;
|
|||
import java.util.SortedMap;
|
||||
|
||||
import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES;
|
||||
import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureUtils.MAPPING_TYPE_SETTING;
|
||||
import static org.hamcrest.Matchers.contains;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.instanceOf;
|
||||
|
@ -238,26 +239,26 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
|
|||
}
|
||||
|
||||
public void testGuessMappingGivenKeyword() {
|
||||
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword");
|
||||
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword");
|
||||
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("ERROR", "INFO", "DEBUG")));
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "not a date")));
|
||||
}
|
||||
|
||||
public void testGuessMappingGivenText() {
|
||||
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text");
|
||||
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "text");
|
||||
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("a", "the quick brown fox jumped over the lazy dog")));
|
||||
}
|
||||
|
||||
public void testGuessMappingGivenIp() {
|
||||
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip");
|
||||
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "ip");
|
||||
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("10.0.0.1", "172.16.0.1", "192.168.0.1")));
|
||||
}
|
||||
|
||||
public void testGuessMappingGivenDouble() {
|
||||
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "double");
|
||||
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "double");
|
||||
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("3.14159265359", "0", "-8")));
|
||||
// 12345678901234567890 is too long for long
|
||||
|
@ -267,7 +268,7 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
|
|||
}
|
||||
|
||||
public void testGuessMappingGivenLong() {
|
||||
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long");
|
||||
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "long");
|
||||
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("500", "3", "-3")));
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(500, 6, 0)));
|
||||
|
@ -275,31 +276,31 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
|
|||
|
||||
public void testGuessMappingGivenDate() {
|
||||
Map<String, String> expected = new HashMap<>();
|
||||
expected.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date");
|
||||
expected.put(MAPPING_TYPE_SETTING, "date");
|
||||
expected.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "iso8601");
|
||||
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "2018-06-11T13:27:12Z")));
|
||||
}
|
||||
|
||||
public void testGuessMappingGivenBoolean() {
|
||||
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "boolean");
|
||||
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "boolean");
|
||||
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("false", "true")));
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(true, false)));
|
||||
}
|
||||
|
||||
public void testGuessMappingGivenArray() {
|
||||
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long");
|
||||
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "long");
|
||||
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(42, Arrays.asList(1, -99))));
|
||||
|
||||
expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword");
|
||||
expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword");
|
||||
|
||||
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(new String[]{ "x", "y" }, "z")));
|
||||
}
|
||||
|
||||
public void testGuessMappingGivenObject() {
|
||||
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "object");
|
||||
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "object");
|
||||
|
||||
assertEquals(expected, guessMapping(explanation, "foo",
|
||||
Arrays.asList(Collections.singletonMap("name", "value1"), Collections.singletonMap("name", "value2"))));
|
||||
|
@ -330,12 +331,12 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
|
|||
|
||||
Map<String, Object> mappings = mappingsAndFieldStats.v1();
|
||||
assertNotNull(mappings);
|
||||
assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo"));
|
||||
assertEquals(Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo"));
|
||||
Map<String, String> expectedTimeMapping = new HashMap<>();
|
||||
expectedTimeMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date");
|
||||
expectedTimeMapping.put(MAPPING_TYPE_SETTING, "date");
|
||||
expectedTimeMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "yyyy-MM-dd HH:mm:ss,SSS");
|
||||
assertEquals(expectedTimeMapping, mappings.get("time"));
|
||||
assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bar"));
|
||||
assertEquals(Collections.singletonMap(MAPPING_TYPE_SETTING, "long"), mappings.get("bar"));
|
||||
assertNull(mappings.get("nothing"));
|
||||
|
||||
Map<String, FieldStats> fieldStats = mappingsAndFieldStats.v2();
|
||||
|
@ -446,7 +447,7 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
|
|||
String mappingType = expectConversion ? randomFrom("long", "double", "boolean") : randomFrom("keyword", "text", "date");
|
||||
String firstTargetField = ((List<String>) csvProcessorSettings.get("target_fields")).get(0);
|
||||
Map<String, Object> mappingsForConversions =
|
||||
Collections.singletonMap(firstTargetField, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType));
|
||||
Collections.singletonMap(firstTargetField, Collections.singletonMap(MAPPING_TYPE_SETTING, mappingType));
|
||||
|
||||
Map<String, Object> pipeline = FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings,
|
||||
mappingsForConversions, null, null, false);
|
||||
|
@ -558,6 +559,55 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
|
|||
assertEquals(Collections.emptyMap(), pipeline);
|
||||
}
|
||||
|
||||
public void testGuessGeoPoint() {
|
||||
Map<String, String> mapping = FileStructureUtils.guessScalarMapping(
|
||||
explanation,
|
||||
"foo",
|
||||
Arrays.asList("POINT (-77.03653 38.897676)", "POINT (-50.03653 28.8973)"),
|
||||
NOOP_TIMEOUT_CHECKER
|
||||
);
|
||||
assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("geo_point"));
|
||||
|
||||
mapping = FileStructureUtils.guessScalarMapping(
|
||||
explanation,
|
||||
"foo",
|
||||
Arrays.asList("POINT (-77.03653 38.897676)", "bar"),
|
||||
NOOP_TIMEOUT_CHECKER
|
||||
);
|
||||
assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("keyword"));
|
||||
}
|
||||
|
||||
public void testGuessGeoShape() {
|
||||
Map<String, String> mapping = FileStructureUtils.guessScalarMapping(
|
||||
explanation,
|
||||
"foo",
|
||||
Arrays.asList(
|
||||
"POINT (-77.03653 38.897676)",
|
||||
"LINESTRING (-77.03653 38.897676, -77.009051 38.889939)",
|
||||
"POLYGON ((100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0, 100.0 0.0))",
|
||||
"POLYGON ((100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0, 100.0 0.0), " +
|
||||
"(100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8, 100.2 0.2))",
|
||||
"MULTIPOINT (102.0 2.0, 103.0 2.0)",
|
||||
"MULTILINESTRING ((102.0 2.0, 103.0 2.0, 103.0 3.0, 102.0 3.0), (100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0)," +
|
||||
" (100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8))",
|
||||
"MULTIPOLYGON (((102.0 2.0, 103.0 2.0, 103.0 3.0, 102.0 3.0, 102.0 2.0)), ((100.0 0.0, 101.0 0.0, 101.0 1.0, " +
|
||||
"100.0 1.0, 100.0 0.0), (100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8, 100.2 0.2)))",
|
||||
"GEOMETRYCOLLECTION (POINT (100.0 0.0), LINESTRING (101.0 0.0, 102.0 1.0))",
|
||||
"BBOX (100.0, 102.0, 2.0, 0.0)"
|
||||
),
|
||||
NOOP_TIMEOUT_CHECKER
|
||||
);
|
||||
assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("geo_shape"));
|
||||
|
||||
mapping = FileStructureUtils.guessScalarMapping(
|
||||
explanation,
|
||||
"foo",
|
||||
Arrays.asList("POINT (-77.03653 38.897676)", "LINESTRING (-77.03653 38.897676, -77.009051 38.889939)", "bar"),
|
||||
NOOP_TIMEOUT_CHECKER
|
||||
);
|
||||
assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("keyword"));
|
||||
}
|
||||
|
||||
private Map<String, String> guessMapping(List<String> explanation, String fieldName, List<Object> fieldValues) {
|
||||
Tuple<Map<String, String>, FieldStats> mappingAndFieldStats = FileStructureUtils.guessMappingAndCalculateFieldStats(explanation,
|
||||
fieldName, fieldValues, NOOP_TIMEOUT_CHECKER);
|
||||
|
|
Loading…
Reference in New Issue