From f00dfb2d5f8a41f12d43c49b12889f3fa1692ab1 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Thu, 21 May 2020 08:22:51 -0400 Subject: [PATCH] [ML] adds WKT support in filestructurefinder (#57014) (#57032) Field mapping detection is done via grok patterns. This commit adds well-known text (WKT) formatted geometry detection. If everything is a `POINT`, then a `geo_point` mapping is preferred. Otherwise, if all the fields are WKT geometries a `geo_shape` mapping is preferred. This does **NOT** detect other types of formatted geometries (geohash, comma delimited points, etc.) closes https://github.com/elastic/elasticsearch/issues/56967 --- .../FileStructureUtils.java | 30 +++++++ .../FileStructureUtilsTests.java | 78 +++++++++++++++---- 2 files changed, 94 insertions(+), 14 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java index 6807dfd8e4b..b3e0a14e377 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java @@ -15,6 +15,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -35,11 +36,35 @@ public final class FileStructureUtils { public static final Set CONVERTIBLE_TYPES = Collections.unmodifiableSet(Sets.newHashSet("integer", "long", "float", "double", "boolean")); + private static final Map EXTENDED_PATTERNS; + static { + Map patterns = new HashMap<>(); + patterns.put("GEO_POINT", "%{NUMBER} %{NUMBER}"); + patterns.put("GEO_POINT_GROUP", "\\(%{GEO_POINT}, (?:%{GEO_POINT}, )*%{GEO_POINT}\\)"); + patterns.put("GEO_POINT_GROUP_GROUP", 
"\\(%{GEO_POINT_GROUP}(?:, %{GEO_POINT_GROUP})*\\)"); + patterns.put("WKT_POINT", "POINT \\(%{GEO_POINT}\\)"); + patterns.put("WKT_LINESTRING", "LINESTRING %{GEO_POINT_GROUP}"); + patterns.put("WKT_MULTIPOINT", "MULTIPOINT %{GEO_POINT_GROUP}"); + patterns.put("WKT_POLYGON", "POLYGON %{GEO_POINT_GROUP_GROUP}"); + patterns.put("WKT_MULTILINESTRING", "MULTILINESTRING %{GEO_POINT_GROUP_GROUP}"); + patterns.put("WKT_MULTIPOLYGON", "MULTIPOLYGON \\(%{GEO_POINT_GROUP_GROUP}(?:, %{GEO_POINT_GROUP_GROUP})*\\)"); + patterns.put("WKT_BBOX", "BBOX \\(%{NUMBER}, %{NUMBER}, %{NUMBER}, %{NUMBER}\\)"); + patterns.put( + "WKT_ANY", + "(?:%{WKT_POINT}|%{WKT_LINESTRING}|%{WKT_MULTIPOINT}|%{WKT_POLYGON}|%{WKT_MULTILINESTRING}|%{WKT_MULTIPOLYGON}|%{WKT_BBOX})" + ); + patterns.put("WKT_GEOMETRYCOLLECTION", "GEOMETRYCOLLECTION \\(%{WKT_ANY}(?:, %{WKT_ANY})*\\)"); /* one or more members */ + patterns.putAll(Grok.getBuiltinPatterns()); + EXTENDED_PATTERNS = Collections.unmodifiableMap(patterns); + } + private static final int NUM_TOP_HITS = 10; // NUMBER Grok pattern doesn't support scientific notation, so we extend it private static final Grok NUMBER_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{NUMBER}(?:[eE][+-]?[0-3]?[0-9]{1,2})?$", TimeoutChecker.watchdog); private static final Grok IP_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{IP}$", TimeoutChecker.watchdog); + private static final Grok GEO_POINT_WKT = new Grok(EXTENDED_PATTERNS, "^%{WKT_POINT}$", TimeoutChecker.watchdog); + private static final Grok GEO_WKT = new Grok(EXTENDED_PATTERNS, "^(?:%{WKT_ANY}|%{WKT_GEOMETRYCOLLECTION})$", TimeoutChecker.watchdog); private static final int KEYWORD_MAX_LEN = 256; private static final int KEYWORD_MAX_SPACES = 5; @@ -317,6 +342,11 @@ public final class FileStructureUtils { } else if (fieldValues.stream().allMatch(IP_GROK::match)) { return Collections.singletonMap(MAPPING_TYPE_SETTING, "ip"); + // geo_point mapping MUST be checked before geo_shape as geo_shape also contains a matcher for geo_point + } else if 
(fieldValues.stream().allMatch(GEO_POINT_WKT::match)) { + return Collections.singletonMap(MAPPING_TYPE_SETTING, "geo_point"); + } else if (fieldValues.stream().allMatch(GEO_WKT::match)) { + return Collections.singletonMap(MAPPING_TYPE_SETTING, "geo_shape"); } if (fieldValues.stream().anyMatch(FileStructureUtils::isMoreLikelyTextThanKeyword)) { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java index d52c3cf87dd..5237e5f0e8c 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java @@ -17,6 +17,7 @@ import java.util.Map; import java.util.SortedMap; import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES; +import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureUtils.MAPPING_TYPE_SETTING; import static org.hamcrest.Matchers.contains; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; @@ -238,26 +239,26 @@ public class FileStructureUtilsTests extends FileStructureTestCase { } public void testGuessMappingGivenKeyword() { - Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"); + Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("ERROR", "INFO", "DEBUG"))); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "not a date"))); } public void testGuessMappingGivenText() { - Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text"); + Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "text"); 
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("a", "the quick brown fox jumped over the lazy dog"))); } public void testGuessMappingGivenIp() { - Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"); + Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "ip"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("10.0.0.1", "172.16.0.1", "192.168.0.1"))); } public void testGuessMappingGivenDouble() { - Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "double"); + Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "double"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("3.14159265359", "0", "-8"))); // 12345678901234567890 is too long for long @@ -267,7 +268,7 @@ public class FileStructureUtilsTests extends FileStructureTestCase { } public void testGuessMappingGivenLong() { - Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"); + Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "long"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("500", "3", "-3"))); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(500, 6, 0))); @@ -275,31 +276,31 @@ public class FileStructureUtilsTests extends FileStructureTestCase { public void testGuessMappingGivenDate() { Map expected = new HashMap<>(); - expected.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + expected.put(MAPPING_TYPE_SETTING, "date"); expected.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "iso8601"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "2018-06-11T13:27:12Z"))); } public void testGuessMappingGivenBoolean() { - Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "boolean"); + Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "boolean"); assertEquals(expected, 
guessMapping(explanation, "foo", Arrays.asList("false", "true"))); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(true, false))); } public void testGuessMappingGivenArray() { - Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"); + Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "long"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(42, Arrays.asList(1, -99)))); - expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"); + expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(new String[]{ "x", "y" }, "z"))); } public void testGuessMappingGivenObject() { - Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "object"); + Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "object"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(Collections.singletonMap("name", "value1"), Collections.singletonMap("name", "value2")))); @@ -330,12 +331,12 @@ public class FileStructureUtilsTests extends FileStructureTestCase { Map mappings = mappingsAndFieldStats.v1(); assertNotNull(mappings); - assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo")); + assertEquals(Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo")); Map expectedTimeMapping = new HashMap<>(); - expectedTimeMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + expectedTimeMapping.put(MAPPING_TYPE_SETTING, "date"); expectedTimeMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "yyyy-MM-dd HH:mm:ss,SSS"); assertEquals(expectedTimeMapping, mappings.get("time")); - assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bar")); + assertEquals(Collections.singletonMap(MAPPING_TYPE_SETTING, "long"), 
mappings.get("bar")); assertNull(mappings.get("nothing")); Map fieldStats = mappingsAndFieldStats.v2(); @@ -446,7 +447,7 @@ public class FileStructureUtilsTests extends FileStructureTestCase { String mappingType = expectConversion ? randomFrom("long", "double", "boolean") : randomFrom("keyword", "text", "date"); String firstTargetField = ((List) csvProcessorSettings.get("target_fields")).get(0); Map mappingsForConversions = - Collections.singletonMap(firstTargetField, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType)); + Collections.singletonMap(firstTargetField, Collections.singletonMap(MAPPING_TYPE_SETTING, mappingType)); Map pipeline = FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings, mappingsForConversions, null, null, false); @@ -558,6 +559,55 @@ public class FileStructureUtilsTests extends FileStructureTestCase { assertEquals(Collections.emptyMap(), pipeline); } + public void testGuessGeoPoint() { + Map mapping = FileStructureUtils.guessScalarMapping( + explanation, + "foo", + Arrays.asList("POINT (-77.03653 38.897676)", "POINT (-50.03653 28.8973)"), + NOOP_TIMEOUT_CHECKER + ); + assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("geo_point")); + + mapping = FileStructureUtils.guessScalarMapping( + explanation, + "foo", + Arrays.asList("POINT (-77.03653 38.897676)", "bar"), + NOOP_TIMEOUT_CHECKER + ); + assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("keyword")); + } + + public void testGuessGeoShape() { + Map mapping = FileStructureUtils.guessScalarMapping( + explanation, + "foo", + Arrays.asList( + "POINT (-77.03653 38.897676)", + "LINESTRING (-77.03653 38.897676, -77.009051 38.889939)", + "POLYGON ((100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0, 100.0 0.0))", + "POLYGON ((100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0, 100.0 0.0), " + + "(100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8, 100.2 0.2))", + "MULTIPOINT (102.0 2.0, 103.0 2.0)", + "MULTILINESTRING ((102.0 2.0, 103.0 2.0, 
103.0 3.0, 102.0 3.0), (100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0)," + + " (100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8))", + "MULTIPOLYGON (((102.0 2.0, 103.0 2.0, 103.0 3.0, 102.0 3.0, 102.0 2.0)), ((100.0 0.0, 101.0 0.0, 101.0 1.0, " + + "100.0 1.0, 100.0 0.0), (100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8, 100.2 0.2)))", + "GEOMETRYCOLLECTION (POINT (100.0 0.0), LINESTRING (101.0 0.0, 102.0 1.0))", + "BBOX (100.0, 102.0, 2.0, 0.0)" + ), + NOOP_TIMEOUT_CHECKER + ); + assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("geo_shape")); + + mapping = FileStructureUtils.guessScalarMapping( + explanation, + "foo", + Arrays.asList("POINT (-77.03653 38.897676)", "LINESTRING (-77.03653 38.897676, -77.009051 38.889939)", "bar"), + NOOP_TIMEOUT_CHECKER + ); + assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("keyword")); + } + private Map guessMapping(List explanation, String fieldName, List fieldValues) { Tuple, FieldStats> mappingAndFieldStats = FileStructureUtils.guessMappingAndCalculateFieldStats(explanation, fieldName, fieldValues, NOOP_TIMEOUT_CHECKER);