diff --git a/docs/reference/ml/transforms.asciidoc b/docs/reference/ml/transforms.asciidoc
index a2276895fc9..9b57c2053f4 100644
--- a/docs/reference/ml/transforms.asciidoc
+++ b/docs/reference/ml/transforms.asciidoc
@@ -490,10 +490,10 @@ PUT _xpack/ml/datafeeds/datafeed-test3
       },
       "script_fields":{
         "sub":{
-          "script":"return domainSplit(doc['query'].value, params).get(0);"
+          "script":"return domainSplit(doc['query'].value).get(0);"
         },
         "hrd":{
-          "script":"return domainSplit(doc['query'].value, params).get(1);"
+          "script":"return domainSplit(doc['query'].value).get(1);"
         }
       }
     }
@@ -511,10 +511,6 @@ registered domain. For example, the highest registered domain of
 `domainSplit()` function returns an array of two values: the first value is the
 subdomain; the second value is the highest registered domain.
 
-NOTE: The `domainSplit()` function takes two arguments. The first argument is
-the string you want to split. The second argument is always `params`. This is a
-technical implementation detail related to how Painless operates internally.
-
 The preview {dfeed} API returns the following results, which show that
 "www.ml.elastic.co" has been split into "elastic.co" and "www.ml":
 
diff --git a/x-pack/plugin/ml/build.gradle b/x-pack/plugin/ml/build.gradle
index 5996458537a..8dd5e61bbc4 100644
--- a/x-pack/plugin/ml/build.gradle
+++ b/x-pack/plugin/ml/build.gradle
@@ -9,9 +9,8 @@ esplugin {
   description 'Elasticsearch Expanded Pack Plugin - Machine Learning'
   classname 'org.elasticsearch.xpack.ml.MachineLearning'
   hasNativeController true
-  extendedPlugins = ['x-pack-core']
+  extendedPlugins = ['x-pack-core', 'lang-painless']
 }
-archivesBaseName = 'x-pack-ml'
 
 configurations {
   nativeBundle {
@@ -42,6 +41,7 @@ compileTestJava.options.compilerArgs << "-Xlint:-deprecation,-rawtypes,-serial,-
 dependencies {
   // "org.elasticsearch.plugin:x-pack-core:${version}" doesn't work with idea because the testArtifacts are also here
   compileOnly project(path: xpackModule('core'), configuration: 'default')
+  compileOnly "org.elasticsearch.plugin:elasticsearch-scripting-painless-spi:${versions.elasticsearch}"
   testCompile project(path: xpackModule('core'), configuration: 'testArtifacts')
   // This should not be here
   testCompile project(path: xpackModule('security'), configuration: 'testArtifacts')
diff --git a/x-pack/plugin/ml/qa/single-node-tests/src/test/java/org/elasticsearch/xpack/ml/transforms/PainlessDomainSplitIT.java b/x-pack/plugin/ml/qa/single-node-tests/src/test/java/org/elasticsearch/xpack/ml/transforms/PainlessDomainSplitIT.java
index ffd869a4a6e..7af4453c2d4 100644
--- a/x-pack/plugin/ml/qa/single-node-tests/src/test/java/org/elasticsearch/xpack/ml/transforms/PainlessDomainSplitIT.java
+++ b/x-pack/plugin/ml/qa/single-node-tests/src/test/java/org/elasticsearch/xpack/ml/transforms/PainlessDomainSplitIT.java
@@ -13,7 +13,6 @@ import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.test.rest.ESRestTestCase;
 import org.elasticsearch.xpack.ml.MachineLearning;
-import org.elasticsearch.xpack.ml.utils.DomainSplitFunction;
 import org.joda.time.DateTime;
 
 import java.util.ArrayList;
@@ -190,8 +189,7 @@ public class PainlessDomainSplitIT extends ESRestTestCase {
 
         Pattern pattern = Pattern.compile("domain_split\":\\[(.*?),(.*?)\\]");
 
-        Map<String, Object> params = new HashMap<>(DomainSplitFunction.params.size() + 1);
-        params.putAll(DomainSplitFunction.params);
+        Map<String, Object> params = new HashMap<>();
         for (TestConfiguration testConfig : tests) {
             params.put("host", testConfig.hostName);
             String mapAsJson = Strings.toString(jsonBuilder().map(params));
@@ -207,8 +205,8 @@ public class PainlessDomainSplitIT extends ESRestTestCase {
                     "      \"domain_split\" : {\n" +
                     "        \"script\" : {\n" +
                     "          \"lang\": \"painless\",\n" +
-                    "          \"inline\": \"" + DomainSplitFunction.function +
-                    "            return domainSplit(params['host'], params); \",\n" +
+                    "          \"inline\": \"" +
+                    "            return domainSplit(params['host']); \",\n" +
                     "          \"params\": " + mapAsJson + "\n" +
                     "        }\n" +
                     "      }\n" +
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningPainlessExtension.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningPainlessExtension.java
new file mode 100644
index 00000000000..b55936acd06
--- /dev/null
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningPainlessExtension.java
@@ -0,0 +1,26 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml;
+
+import org.elasticsearch.painless.spi.PainlessExtension;
+import org.elasticsearch.painless.spi.Whitelist;
+import org.elasticsearch.painless.spi.WhitelistLoader;
+import org.elasticsearch.script.ScriptContext;
+import org.elasticsearch.script.SearchScript;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+public class MachineLearningPainlessExtension implements PainlessExtension {
+    private static final Whitelist WHITELIST =
+            WhitelistLoader.loadFromResourceFiles(MachineLearningPainlessExtension.class, "whitelist.txt");
+
+    @Override
+    public Map<ScriptContext<?>, List<Whitelist>> getContextWhitelists() {
+        return Collections.singletonMap(SearchScript.CONTEXT, Collections.singletonList(WHITELIST));
+    }
+}
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/scroll/ScrollDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/scroll/ScrollDataExtractor.java
index 57681a0aafb..7274343a999 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/scroll/ScrollDataExtractor.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/scroll/ScrollDataExtractor.java
@@ -17,7 +17,6 @@ import org.elasticsearch.action.search.SearchScrollRequestBuilder;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.unit.TimeValue;
-import org.elasticsearch.script.Script;
 import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.search.fetch.StoredFieldsContext;
 import org.elasticsearch.search.fetch.subphase.DocValueFieldsContext;
@@ -25,14 +24,11 @@ import org.elasticsearch.search.sort.SortOrder;
 import org.elasticsearch.xpack.core.ClientHelper;
 import org.elasticsearch.xpack.core.ml.datafeed.extractor.DataExtractor;
 import org.elasticsearch.xpack.core.ml.datafeed.extractor.ExtractorUtils;
-import org.elasticsearch.xpack.ml.utils.DomainSplitFunction;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.HashMap;
-import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.Objects;
 import java.util.Optional;
@@ -130,26 +126,10 @@ class ScrollDataExtractor implements DataExtractor {
         } else {
             searchRequestBuilder.setFetchSource(sourceFields, null);
         }
-        context.scriptFields.forEach(f -> searchRequestBuilder.addScriptField(
-                f.fieldName(), injectDomainSplit(f.script())));
+        context.scriptFields.forEach(f -> searchRequestBuilder.addScriptField(f.fieldName(), f.script()));
         return searchRequestBuilder;
     }
 
-    private Script injectDomainSplit(Script script) {
-        String code = script.getIdOrCode();
-        if (code.contains("domainSplit(") && script.getLang().equals("painless")) {
-            String modifiedCode = DomainSplitFunction.function + code;
-            Map<String, Object> modifiedParams = new HashMap<>(script.getParams().size()
-                    + DomainSplitFunction.params.size());
-
-            modifiedParams.putAll(script.getParams());
-            modifiedParams.putAll(DomainSplitFunction.params);
-
-            return new Script(script.getType(), script.getLang(), modifiedCode, modifiedParams);
-        }
-        return script;
-    }
-
     private InputStream processSearchResponse(SearchResponse searchResponse) throws IOException {
 
         if (searchResponse.getFailedShards() > 0 && searchHasShardFailure == false) {
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/DomainSplitFunction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/DomainSplitFunction.java
index 293885fb87f..332015bc137 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/DomainSplitFunction.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/DomainSplitFunction.java
@@ -5,25 +5,33 @@
  */
 package org.elasticsearch.xpack.ml.utils;
 
+import org.apache.logging.log4j.LogManager;
 import org.elasticsearch.common.io.Streams;
+import org.elasticsearch.common.logging.DeprecationLogger;
 
 import java.io.InputStream;
+import java.security.AccessController;
+import java.security.PrivilegedAction;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
+import java.util.StringJoiner;
 
 public final class DomainSplitFunction {
 
-    public static final String function;
-    public static final Map<String, Object> params;
+    private static final DeprecationLogger DEPRECATION_LOGGER =
+            new DeprecationLogger(LogManager.getLogger(DomainSplitFunction.class));
 
-    DomainSplitFunction() {}
+    private static final int MAX_DOMAIN_PART_LENGTH = 63;
 
+    private static final Map<String, String> exact;
+    private static final Map<String, String> under;
+    private static final Map<String, String> excluded;
 
     static {
-        Map<String, Object> paramsMap = new HashMap<>();
-
-        Map<String, String> exact = new HashMap<>(2048);
+        Map<String, String> exactMap = new HashMap<>(2048);
 
         String exactResourceName = "org/elasticsearch/xpack/ml/transforms/exact.properties";
 
@@ -31,253 +39,205 @@ public final class DomainSplitFunction {
             List<String> lines = Streams.readAllLines(resource);
             for (String line : lines) {
                 String[] split = line.split("=");
-                exact.put(split[0].trim(), split[1].trim());
+                exactMap.put(split[0].trim(), split[1].trim());
             }
         } catch (Exception e) {
             throw new RuntimeException("Could not load DomainSplit resource", e);
         }
-        exact = Collections.unmodifiableMap(exact);
+        exact = Collections.unmodifiableMap(exactMap);
 
-        Map<String, String> under = new HashMap<>(30);
-        under.put("bd", "i");
-        under.put("np", "i");
-        under.put("jm", "i");
-        under.put("fj", "i");
-        under.put("fk", "i");
-        under.put("ye", "i");
-        under.put("sch.uk", "i");
-        under.put("bn", "i");
-        under.put("kitakyushu.jp", "i");
-        under.put("kobe.jp", "i");
-        under.put("ke", "i");
-        under.put("sapporo.jp", "i");
-        under.put("kh", "i");
-        under.put("mm", "i");
-        under.put("il", "i");
-        under.put("yokohama.jp", "i");
-        under.put("ck", "i");
-        under.put("nagoya.jp", "i");
-        under.put("sendai.jp", "i");
-        under.put("kw", "i");
-        under.put("er", "i");
-        under.put("mz", "i");
-        under.put("platform.sh", "p");
-        under.put("gu", "i");
-        under.put("nom.br", "i");
-        under.put("zm", "i");
-        under.put("pg", "i");
-        under.put("ni", "i");
-        under.put("kawasaki.jp", "i");
-        under.put("zw", "i");
-        under = Collections.unmodifiableMap(under);
+        Map<String, String> underMap = new HashMap<>(30);
+        underMap.put("bd", "i");
+        underMap.put("np", "i");
+        underMap.put("jm", "i");
+        underMap.put("fj", "i");
+        underMap.put("fk", "i");
+        underMap.put("ye", "i");
+        underMap.put("sch.uk", "i");
+        underMap.put("bn", "i");
+        underMap.put("kitakyushu.jp", "i");
+        underMap.put("kobe.jp", "i");
+        underMap.put("ke", "i");
+        underMap.put("sapporo.jp", "i");
+        underMap.put("kh", "i");
+        underMap.put("mm", "i");
+        underMap.put("il", "i");
+        underMap.put("yokohama.jp", "i");
+        underMap.put("ck", "i");
+        underMap.put("nagoya.jp", "i");
+        underMap.put("sendai.jp", "i");
+        underMap.put("kw", "i");
+        underMap.put("er", "i");
+        underMap.put("mz", "i");
+        underMap.put("platform.sh", "p");
+        underMap.put("gu", "i");
+        underMap.put("nom.br", "i");
+        underMap.put("zm", "i");
+        underMap.put("pg", "i");
+        underMap.put("ni", "i");
+        underMap.put("kawasaki.jp", "i");
+        underMap.put("zw", "i");
+        under = Collections.unmodifiableMap(underMap);
 
-        Map<String, String> excluded = new HashMap<>(9);
-        excluded.put("city.yokohama.jp", "i");
-        excluded.put("teledata.mz", "i");
-        excluded.put("city.kobe.jp", "i");
-        excluded.put("city.sapporo.jp", "i");
-        excluded.put("city.kawasaki.jp", "i");
-        excluded.put("city.nagoya.jp", "i");
-        excluded.put("www.ck", "i");
-        excluded.put("city.sendai.jp", "i");
-        excluded.put("city.kitakyushu.jp", "i");
-        excluded = Collections.unmodifiableMap(excluded);
-
-
-        paramsMap.put("excluded", excluded);
-        paramsMap.put("under", under);
-        paramsMap.put("exact", exact);
-        params = Collections.unmodifiableMap(paramsMap);
+        Map<String, String> excludedMap = new HashMap<>(9);
+        excludedMap.put("city.yokohama.jp", "i");
+        excludedMap.put("teledata.mz", "i");
+        excludedMap.put("city.kobe.jp", "i");
+        excludedMap.put("city.sapporo.jp", "i");
+        excludedMap.put("city.kawasaki.jp", "i");
+        excludedMap.put("city.nagoya.jp", "i");
+        excludedMap.put("www.ck", "i");
+        excludedMap.put("city.sendai.jp", "i");
+        excludedMap.put("city.kitakyushu.jp", "i");
+        excluded = Collections.unmodifiableMap(excludedMap);
     }
 
-    static {
-        String fn = "String replaceDots(String input) {\n" +
-                "    String output = input;\n" +
-                "    if (output.indexOf('。') >= 0) {\n" +
-                "        output = output.replace('。', '.');\n" +
-                "    }\n" +
-                "    if (output.indexOf('．') >= 0) {\n" +
-                "        output = output.replace('．', '.');\n" +
-                "    }\n" +
-                "    if (output.indexOf('｡') >= 0) {\n" +
-                "        output = output.replace('｡', '.');\n" +
-                "    }\n" +
-                "    return output;\n" +
-                "}\n" +
-                "List split(String value) {\n" +
-                "    int nextWord = 0;\n" +
-                "    List splits = [];\n" +
-                "    for(int i = 0; i < value.length(); i++) {\n" +
-                "        if(value.charAt(i) == (char)'.') {\n" +
-                "            splits.add(value.substring(nextWord, i));\n" +
-                "            nextWord = i+1;\n" +
-                "        }\n" +
-                "    }\n" +
-                "    if (nextWord != value.length()) {\n" +
-                "        splits.add(value.substring(nextWord, value.length()));\n" +
-                "    }\n" +
-                "    return splits;\n" +
-                "}\n" +
-                "List splitDomain(String domain) {\n" +
-                "    String dotDomain = replaceDots(domain);\n" +
-                "    return split(dotDomain);\n" +
-                "}\n" +
-                "boolean validateSyntax(List parts) {\n" +
-                "    int lastIndex = parts.length - 1;\n" +
-                "    /* Validate the last part specially, as it has different syntax rules. */\n" +
-                "    if (!validatePart(parts[lastIndex], true)) {\n" +
-                "        return false;\n" +
-                "    }\n" +
-                "    for (int i = 0; i < lastIndex; i++) {\n" +
-                "        String part = parts[i];\n" +
-                "        if (!validatePart(part, false)) {\n" +
-                "            return false;\n" +
-                "        }\n" +
-                "    }\n" +
-                "    return true;\n" +
-                "}\n" +
-                "boolean validatePart(String part, boolean isFinalPart) {\n" +
-                "    int MAX_DOMAIN_PART_LENGTH = 63;\n" +
-                "    if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {\n" +
-                "        return false;\n" +
-                "    }\n" +
-                "    int offset = 0;\n" +
-                "    int strLen = part.length();\n" +
-                "    while (offset < strLen) {\n" +
-                "        int curChar = part.charAt(offset);\n" +
-                "        offset += 1;\n" +
-                "        if (!(Character.isLetterOrDigit(curChar) || curChar == (char)'-' || curChar == (char)'_')) {\n" +
-                "            return false;\n" +
-                "        }\n" +
-                "    }\n" +
-                "    if (part.charAt(0) == (char)'-' || part.charAt(0) == (char)'_' ||\n" +
-                "        part.charAt(part.length() - 1) == (char)'-' || part.charAt(part.length() - 1) == (char)'_') {\n" +
-                "        return false;\n" +
-                "    }\n" +
-                "    if (isFinalPart && Character.isDigit(part.charAt(0))) {\n" +
-                "        return false;\n" +
-                "    }\n" +
-                "    return true;\n" +
-                "}\n" +
-                "int findPublicSuffix(Map params, List parts) {\n" +
-                "    int partsSize = parts.size();\n" +
-                "\n" +
-                "    for (int i = 0; i < partsSize; i++) {\n" +
-                "        StringJoiner joiner = new StringJoiner('.');\n" +
-                "        for (String s : parts.subList(i, partsSize)) {\n" +
-                "            joiner.add(s);\n" +
-                "        }\n" +
-                "        /* parts.subList(i, partsSize).each(joiner::add); */\n" +
-                "        String ancestorName = joiner.toString();\n" +
-                "\n" +
-                "        if (params['exact'].containsKey(ancestorName)) {\n" +
-                "            return i;\n" +
-                "        }\n" +
-                "\n" +
-                "        /* Excluded domains (e.g. !nhs.uk) use the next highest\n" +
-                "           domain as the effective public suffix (e.g. uk). */\n" +
-                "\n" +
-                "        if (params['excluded'].containsKey(ancestorName)) {\n" +
-                "            return i + 1;\n" +
-                "        }\n" +
-                "\n" +
-                "        List pieces = split(ancestorName);\n" +
-                "        if (pieces.length >= 2 && params['under'].containsKey(pieces[1])) {\n" +
-                "            return i;\n" +
-                "        }\n" +
-                "    }\n" +
-                "\n" +
-                "    return -1;\n" +
-                "}\n" +
-                "String ancestor(List parts, int levels) {\n" +
-                "    StringJoiner joiner = new StringJoiner('.');\n" +
-                "    for (String s : parts.subList(levels, parts.size())) {\n" +
-                "        joiner.add(s);\n" +
-                "    }\n" +
-                "    String name = joiner.toString();\n" +
-                "    if (name.endsWith('.')) {\n" +
-                "        name = name.substring(0, name.length() - 1);\n" +
-                "    }\n" +
-                "    return name;\n" +
-                "}\n" +
-                "String topPrivateDomain(String name, List parts, int publicSuffixIndex) {\n" +
-                "    if (publicSuffixIndex == 1) {\n" +
-                "        return name;\n" +
-                "    }\n" +
-                "    if (!(publicSuffixIndex > 0)) {\n" +
-                "        throw new IllegalArgumentException('Not under a public suffix: ' + name);\n" +
-                "    }\n" +
-                "    return ancestor(parts, publicSuffixIndex - 1);\n" +
-                "}\n" +
-                "List domainSplit(String host, Map params) {\n" +
-                "    int MAX_DNS_NAME_LENGTH = 253;\n" +
-                "    int MAX_LENGTH = 253;\n" +
-                "    int MAX_PARTS = 127;\n" +
-                "    if ('host'.isEmpty()) {\n" +
-                "        return ['',''];\n" +
-                "    }\n" +
-                "    host = host.trim();\n" +
-                "    if (host.contains(':')) {\n" +
-                "        return ['', host];\n" +
-                "    }\n" +
-                "    boolean tentativeIP = true;\n" +
-                "    for(int i = 0; i < host.length(); i++) {\n" +
-                "        if (!(Character.isDigit(host.charAt(i)) || host.charAt(i) == (char)'.')) {\n" +
-                "            tentativeIP = false;\n" +
-                "            break;\n" +
-                "        }\n" +
-                "    }\n" +
-                "    if (tentativeIP) {\n" +
-                "        /* special-snowflake rules now... */\n" +
-                "        if (host == '.') {\n" +
-                "            return ['',''];\n" +
-                "        }\n" +
-                "        return ['', host];\n" +
-                "    }\n" +
-                "    def normalizedHost = host;\n" +
-                "    normalizedHost = normalizedHost.toLowerCase();\n" +
-                "    List parts = splitDomain(normalizedHost);\n" +
-                "    int publicSuffixIndex = findPublicSuffix(params, parts);\n" +
-                "    if (publicSuffixIndex == 0) {\n" +
-                "        return ['', host];\n" +
-                "    }\n" +
-                "    String highestRegistered = '';\n" +
-                "    /* for the case where the host is internal like .local so is not a recognised public suffix */\n" +
-                "    if (publicSuffixIndex == -1) {\n" +
-                "        if (!parts.isEmpty()) {\n" +
-                "            if (parts.size() == 1) {\n" +
-                "                return ['', host];\n" +
-                "            }\n" +
-                "            if (parts.size() > 2) {\n" +
-                "                boolean allNumeric = true;\n" +
-                "                String value = parts.get(parts.size() - 1);\n" +
-                "                for (int i = 0; i < value.length(); i++) {\n" +
-                "                    if (!Character.isDigit(value.charAt(i))) {\n" +
-                "                        allNumeric = false;\n" +
-                "                        break;\n" +
-                "                    }\n" +
-                "                }\n" +
-                "                if (allNumeric) {\n" +
-                "                    highestRegistered = parts.get(parts.size() - 2) + '.' + parts.get(parts.size() - 1);\n" +
-                "                } else {\n" +
-                "                    highestRegistered = parts.get(parts.size() - 1);\n" +
-                "                }\n" +
-                "\n" +
-                "            } else {\n" +
-                "                highestRegistered = parts.get(parts.size() - 1);\n" +
-                "            }\n" +
-                "        }\n" +
-                "    } else {\n" +
-                "        /* HRD is the top private domain */\n" +
-                "        highestRegistered = topPrivateDomain(normalizedHost, parts, publicSuffixIndex);\n" +
-                "    }\n" +
-                "    String subDomain = host.substring(0, host.length() - highestRegistered.length());\n" +
-                "    if (subDomain.endsWith('.')) {\n" +
-                "        subDomain = subDomain.substring(0, subDomain.length() - 1);\n" +
-                "    }\n" +
-                "    return [subDomain, highestRegistered];\n" +
-                "}\n";
-        fn = fn.replace("\n"," ");
-        function = fn;
+    private DomainSplitFunction() {}
+
+    private static String replaceDots(String input) {
+        String output = input;
+        if (output.indexOf('。') >= 0) {
+            output = output.replace('。', '.');
+        }
+        if (output.indexOf('．') >= 0) {
+            output = output.replace('．', '.');
+        }
+        if (output.indexOf('｡') >= 0) {
+            output = output.replace('｡', '.');
+        }
+        return output;
+    }
+
+    private static List<String> splitDomain(String domain) {
+        String dotDomain = replaceDots(domain);
+        return Arrays.asList(dotDomain.split("\\."));
+    }
+
+    private static int findPublicSuffix(List<String> parts) {
+        int partsSize = parts.size();
+        for (int i = 0; i < partsSize; i++) {
+            StringJoiner joiner = new StringJoiner(".");
+            for (String s : parts.subList(i, partsSize)) {
+                joiner.add(s);
+            }
+            /* parts.subList(i, partsSize).each(joiner::add); */
+            String ancestorName = joiner.toString();
+            if (exact.containsKey(ancestorName)) {
+                return i;
+            }
+            /* Excluded domains (e.g. !nhs.uk) use the next highest
+               domain as the effective public suffix (e.g. uk). */
+            if (excluded.containsKey(ancestorName)) {
+                return i + 1;
+            }
+            String[] pieces = ancestorName.split("\\.");
+            if (pieces.length >= 2 && under.containsKey(pieces[1])) {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    private static String ancestor(List<String> parts, int levels) {
+        StringJoiner joiner = new StringJoiner(".");
+        for (String s : parts.subList(levels, parts.size())) {
+            joiner.add(s);
+        }
+        String name = joiner.toString();
+        if (name.endsWith(".")) {
+            name = name.substring(0, name.length() - 1);
+        }
+        return name;
+    }
+
+    private static String topPrivateDomain(String name, List<String> parts, int publicSuffixIndex) {
+        if (publicSuffixIndex == 1) {
+            return name;
+        }
+        if (!(publicSuffixIndex > 0)) {
+            throw new IllegalArgumentException("Not under a public suffix: " + name);
+        }
+        return ancestor(parts, publicSuffixIndex - 1);
+    }
+
+    public static List<String> domainSplit(String host, Map<String, Object> params) {
+        // NOTE: we don't check SpecialPermission because this will be called (indirectly) from scripts
+        AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
+            DEPRECATION_LOGGER.deprecatedAndMaybeLog("domainSplit",
+                    "Method [domainSplit] taking params is deprecated. Remove the params argument.");
+            return null;
+        });
+        return domainSplit(host);
+    }
+
+    /**
+     * Split {@code host} into sub domain and highest registered domain.
+     * The result is a list containing exactly 2 items: the first is the sub domain
+     * and the second the highest registered domain.
+     *
+     * @param host The hostname to split
+     * @return The sub domain and highest registered domain
+     */
+    public static List<String> domainSplit(String host) {
+        host = host.trim();
+        if (host.contains(":")) {
+            return Arrays.asList("", host);
+        }
+        boolean tentativeIP = true;
+        for(int i = 0; i < host.length(); i++) {
+            if (!(Character.isDigit(host.charAt(i)) || host.charAt(i) == '.')) {
+                tentativeIP = false;
+                break;
+            }
+        }
+        if (tentativeIP) {
+            /* special-snowflake rules now... */
+            if (host.equals(".")) {
+                return Arrays.asList("","");
+            }
+            return Arrays.asList("", host);
+        }
+        String normalizedHost = host;
+        normalizedHost = normalizedHost.toLowerCase(Locale.ROOT);
+        List<String> parts = splitDomain(normalizedHost);
+        int publicSuffixIndex = findPublicSuffix(parts);
+        if (publicSuffixIndex == 0) {
+            return Arrays.asList("", host);
+        }
+        String highestRegistered = "";
+        /* for the case where the host is internal like .local so is not a recognised public suffix */
+        if (publicSuffixIndex == -1) {
+            if (!parts.isEmpty()) {
+                if (parts.size() == 1) {
+                    return Arrays.asList("", host);
+                }
+                if (parts.size() > 2) {
+                    boolean allNumeric = true;
+                    String value = parts.get(parts.size() - 1);
+                    for (int i = 0; i < value.length(); i++) {
+                        if (!Character.isDigit(value.charAt(i))) {
+                            allNumeric = false;
+                            break;
+                        }
+                    }
+                    if (allNumeric) {
+                        highestRegistered = parts.get(parts.size() - 2) + '.' + parts.get(parts.size() - 1);
+                    } else {
+                        highestRegistered = parts.get(parts.size() - 1);
+                    }
+
+                } else {
+                    highestRegistered = parts.get(parts.size() - 1);
+                }
+            }
+        } else {
+            /* HRD is the top private domain */
+            highestRegistered = topPrivateDomain(normalizedHost, parts, publicSuffixIndex);
+        }
+        String subDomain = host.substring(0, host.length() - highestRegistered.length());
+        if (subDomain.endsWith(".")) {
+            subDomain = subDomain.substring(0, subDomain.length() - 1);
+        }
+        return Arrays.asList(subDomain, highestRegistered);
     }
 }
diff --git a/x-pack/plugin/ml/src/main/resources/META-INF/services/org.elasticsearch.painless.spi.PainlessExtension b/x-pack/plugin/ml/src/main/resources/META-INF/services/org.elasticsearch.painless.spi.PainlessExtension
new file mode 100644
index 00000000000..c320fa54c01
--- /dev/null
+++ b/x-pack/plugin/ml/src/main/resources/META-INF/services/org.elasticsearch.painless.spi.PainlessExtension
@@ -0,0 +1 @@
+org.elasticsearch.xpack.ml.MachineLearningPainlessExtension
\ No newline at end of file
diff --git a/x-pack/plugin/ml/src/main/resources/org/elasticsearch/xpack/ml/whitelist.txt b/x-pack/plugin/ml/src/main/resources/org/elasticsearch/xpack/ml/whitelist.txt
new file mode 100644
index 00000000000..d15d6c1cba3
--- /dev/null
+++ b/x-pack/plugin/ml/src/main/resources/org/elasticsearch/xpack/ml/whitelist.txt
@@ -0,0 +1,5 @@
+
+static_import {
+  List domainSplit(String) from_class org.elasticsearch.xpack.ml.utils.DomainSplitFunction
+  List domainSplit(String,Map) from_class org.elasticsearch.xpack.ml.utils.DomainSplitFunction
+}
\ No newline at end of file
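The two resource files above are what wire the whitelist into Painless: the `lang-painless` module discovers `PainlessExtension` implementations through the standard Java service-loader mechanism and merges the whitelists they return into the named script contexts. The sketch below is illustrative only, not code from this change, and the printed output format is invented; it shows roughly how the `META-INF/services` entry is consumed.

    import org.elasticsearch.painless.spi.PainlessExtension;

    import java.util.ServiceLoader;

    // Illustrative sketch: MachineLearningPainlessExtension contributes its whitelist.txt
    // methods to the SearchScript context, which is what makes domainSplit(String) and the
    // deprecated domainSplit(String, Map) resolvable from datafeed scripts.
    public class PainlessExtensionDiscoverySketch {
        public static void main(String[] args) {
            for (PainlessExtension extension : ServiceLoader.load(PainlessExtension.class)) {
                extension.getContextWhitelists().forEach((context, whitelists) ->
                        System.out.println(context.name + " <- " + whitelists.size() + " whitelist(s)"));
            }
        }
    }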
+ */ + +package org.elasticsearch.xpack.ml.utils; + +import org.elasticsearch.test.ESTestCase; + +import java.util.List; + +public class DomainSplitFunctionTests extends ESTestCase { + + public void testDomainSplit() { + // Test cases from https://github.com/john-kurkowski/tldextract/tree/master/tldextract/tests + assertDomainSplit("www", "google.com", "www.google.com"); + assertDomainSplit("www.maps", "google.co.uk", "www.maps.google.co.uk"); + assertDomainSplit("www", "theregister.co.uk", "www.theregister.co.uk"); + assertDomainSplit("", "gmail.com", "gmail.com"); + assertDomainSplit("media.forums", "theregister.co.uk", "media.forums.theregister.co.uk"); + assertDomainSplit("www", "www.com", "www.www.com"); + assertDomainSplit("", "www.com", "www.com"); + assertDomainSplit("", "internalunlikelyhostname", "internalunlikelyhostname"); + assertDomainSplit("internalunlikelyhostname", "bizarre", "internalunlikelyhostname.bizarre"); + assertDomainSplit("", "internalunlikelyhostname.info", "internalunlikelyhostname.info"); // .info is a valid TLD + assertDomainSplit("internalunlikelyhostname", "information", "internalunlikelyhostname.information"); + assertDomainSplit("", "216.22.0.192", "216.22.0.192"); + assertDomainSplit("", "::1", "::1"); + assertDomainSplit("", "FE80:0000:0000:0000:0202:B3FF:FE1E:8329", "FE80:0000:0000:0000:0202:B3FF:FE1E:8329"); + assertDomainSplit("216.22", "project.coop", "216.22.project.coop"); + assertDomainSplit("www", "xn--h1alffa9f.xn--p1ai", "www.xn--h1alffa9f.xn--p1ai"); + assertDomainSplit("", "", ""); + assertDomainSplit("www", "parliament.uk", "www.parliament.uk"); + assertDomainSplit("www", "parliament.co.uk", "www.parliament.co.uk"); + assertDomainSplit("www.a", "cgs.act.edu.au", "www.a.cgs.act.edu.au"); + assertDomainSplit("www", "google.com.au", "www.google.com.au"); + assertDomainSplit("www", "metp.net.cn", "www.metp.net.cn"); + assertDomainSplit("www", "waiterrant.blogspot.com", "www.waiterrant.blogspot.com"); + assertDomainSplit("", "kittens.blogspot.co.uk", "kittens.blogspot.co.uk"); + assertDomainSplit("example", "example", "example.example"); + assertDomainSplit("b.example", "example", "b.example.example"); + assertDomainSplit("a.b.example", "example", "a.b.example.example"); + assertDomainSplit("example", "local", "example.local"); + assertDomainSplit("b.example", "local", "b.example.local"); + assertDomainSplit("a.b.example", "local", "a.b.example.local"); + assertDomainSplit("r192494180984795-1-1041782-channel-live.ums", "ustream.tv", + "r192494180984795-1-1041782-channel-live.ums.ustream.tv"); + } + + private void assertDomainSplit(String expectedSubDomain, String expectedDomain, String hostName) { + List split = DomainSplitFunction.domainSplit(hostName); + assertEquals(expectedSubDomain, split.get(0)); + assertEquals(expectedDomain, split.get(1)); + } +}