Scripting: Convert domainSplit function for ML to whitelist (#34426)

This commit moves the definition of domainSplit into Java and exposes it
as a Painless whitelist extension. The method also no longer needs
params; a version which ignores params is added and deprecated.
This commit is contained in:
Ryan Ernst 2018-10-17 15:54:21 -07:00 committed by GitHub
parent 8734540345
commit d445785f1a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 302 additions and 282 deletions

View File

@ -490,10 +490,10 @@ PUT _xpack/ml/datafeeds/datafeed-test3
}, },
"script_fields":{ "script_fields":{
"sub":{ "sub":{
"script":"return domainSplit(doc['query'].value, params).get(0);" "script":"return domainSplit(doc['query'].value).get(0);"
}, },
"hrd":{ "hrd":{
"script":"return domainSplit(doc['query'].value, params).get(1);" "script":"return domainSplit(doc['query'].value).get(1);"
} }
} }
} }
@ -511,10 +511,6 @@ registered domain. For example, the highest registered domain of
`domainSplit()` function returns an array of two values: the first value is the `domainSplit()` function returns an array of two values: the first value is the
subdomain; the second value is the highest registered domain. subdomain; the second value is the highest registered domain.
NOTE: The `domainSplit()` function takes two arguments. The first argument is
the string you want to split. The second argument is always `params`. This is a
technical implementation detail related to how Painless operates internally.
The preview {dfeed} API returns the following results, which show that The preview {dfeed} API returns the following results, which show that
"www.ml.elastic.co" has been split into "elastic.co" and "www.ml": "www.ml.elastic.co" has been split into "elastic.co" and "www.ml":

View File

@ -9,9 +9,8 @@ esplugin {
description 'Elasticsearch Expanded Pack Plugin - Machine Learning' description 'Elasticsearch Expanded Pack Plugin - Machine Learning'
classname 'org.elasticsearch.xpack.ml.MachineLearning' classname 'org.elasticsearch.xpack.ml.MachineLearning'
hasNativeController true hasNativeController true
extendedPlugins = ['x-pack-core'] extendedPlugins = ['x-pack-core', 'lang-painless']
} }
archivesBaseName = 'x-pack-ml'
configurations { configurations {
nativeBundle { nativeBundle {
@ -42,6 +41,7 @@ compileTestJava.options.compilerArgs << "-Xlint:-deprecation,-rawtypes,-serial,-
dependencies { dependencies {
// "org.elasticsearch.plugin:x-pack-core:${version}" doesn't work with idea because the testArtifacts are also here // "org.elasticsearch.plugin:x-pack-core:${version}" doesn't work with idea because the testArtifacts are also here
compileOnly project(path: xpackModule('core'), configuration: 'default') compileOnly project(path: xpackModule('core'), configuration: 'default')
compileOnly "org.elasticsearch.plugin:elasticsearch-scripting-painless-spi:${versions.elasticsearch}"
testCompile project(path: xpackModule('core'), configuration: 'testArtifacts') testCompile project(path: xpackModule('core'), configuration: 'testArtifacts')
// This should not be here // This should not be here
testCompile project(path: xpackModule('security'), configuration: 'testArtifacts') testCompile project(path: xpackModule('security'), configuration: 'testArtifacts')

View File

@ -13,7 +13,6 @@ import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.rest.ESRestTestCase; import org.elasticsearch.test.rest.ESRestTestCase;
import org.elasticsearch.xpack.ml.MachineLearning; import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.utils.DomainSplitFunction;
import org.joda.time.DateTime; import org.joda.time.DateTime;
import java.util.ArrayList; import java.util.ArrayList;
@ -190,8 +189,7 @@ public class PainlessDomainSplitIT extends ESRestTestCase {
Pattern pattern = Pattern.compile("domain_split\":\\[(.*?),(.*?)\\]"); Pattern pattern = Pattern.compile("domain_split\":\\[(.*?),(.*?)\\]");
Map<String, Object> params = new HashMap<>(DomainSplitFunction.params.size() + 1); Map<String, Object> params = new HashMap<>();
params.putAll(DomainSplitFunction.params);
for (TestConfiguration testConfig : tests) { for (TestConfiguration testConfig : tests) {
params.put("host", testConfig.hostName); params.put("host", testConfig.hostName);
String mapAsJson = Strings.toString(jsonBuilder().map(params)); String mapAsJson = Strings.toString(jsonBuilder().map(params));
@ -207,8 +205,8 @@ public class PainlessDomainSplitIT extends ESRestTestCase {
" \"domain_split\" : {\n" + " \"domain_split\" : {\n" +
" \"script\" : {\n" + " \"script\" : {\n" +
" \"lang\": \"painless\",\n" + " \"lang\": \"painless\",\n" +
" \"inline\": \"" + DomainSplitFunction.function + " \"inline\": \"" +
" return domainSplit(params['host'], params); \",\n" + " return domainSplit(params['host']); \",\n" +
" \"params\": " + mapAsJson + "\n" + " \"params\": " + mapAsJson + "\n" +
" }\n" + " }\n" +
" }\n" + " }\n" +

View File

@ -0,0 +1,26 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml;
import org.elasticsearch.painless.spi.PainlessExtension;
import org.elasticsearch.painless.spi.Whitelist;
import org.elasticsearch.painless.spi.WhitelistLoader;
import org.elasticsearch.script.ScriptContext;
import org.elasticsearch.script.SearchScript;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
 * Painless extension for the machine learning plugin. It registers a single
 * whitelist, loaded from {@code whitelist.txt}, for the search script context.
 */
public class MachineLearningPainlessExtension implements PainlessExtension {

    /** Whitelist loaded once from the {@code whitelist.txt} resource via this class. */
    private static final Whitelist ML_WHITELIST =
        WhitelistLoader.loadFromResourceFiles(MachineLearningPainlessExtension.class, "whitelist.txt");

    /**
     * @return a single-entry map associating {@link SearchScript#CONTEXT} with
     *         a one-element list holding the ML whitelist
     */
    @Override
    public Map<ScriptContext<?>, List<Whitelist>> getContextWhitelists() {
        List<Whitelist> searchWhitelists = Collections.singletonList(ML_WHITELIST);
        return Collections.singletonMap(SearchScript.CONTEXT, searchWhitelists);
    }
}

View File

@ -17,7 +17,6 @@ import org.elasticsearch.action.search.SearchScrollRequestBuilder;
import org.elasticsearch.client.Client; import org.elasticsearch.client.Client;
import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.script.Script;
import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.fetch.StoredFieldsContext; import org.elasticsearch.search.fetch.StoredFieldsContext;
import org.elasticsearch.search.fetch.subphase.DocValueFieldsContext; import org.elasticsearch.search.fetch.subphase.DocValueFieldsContext;
@ -25,14 +24,11 @@ import org.elasticsearch.search.sort.SortOrder;
import org.elasticsearch.xpack.core.ClientHelper; import org.elasticsearch.xpack.core.ClientHelper;
import org.elasticsearch.xpack.core.ml.datafeed.extractor.DataExtractor; import org.elasticsearch.xpack.core.ml.datafeed.extractor.DataExtractor;
import org.elasticsearch.xpack.core.ml.datafeed.extractor.ExtractorUtils; import org.elasticsearch.xpack.core.ml.datafeed.extractor.ExtractorUtils;
import org.elasticsearch.xpack.ml.utils.DomainSplitFunction;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.NoSuchElementException; import java.util.NoSuchElementException;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
@ -130,26 +126,10 @@ class ScrollDataExtractor implements DataExtractor {
} else { } else {
searchRequestBuilder.setFetchSource(sourceFields, null); searchRequestBuilder.setFetchSource(sourceFields, null);
} }
context.scriptFields.forEach(f -> searchRequestBuilder.addScriptField( context.scriptFields.forEach(f -> searchRequestBuilder.addScriptField(f.fieldName(), f.script()));
f.fieldName(), injectDomainSplit(f.script())));
return searchRequestBuilder; return searchRequestBuilder;
} }
private Script injectDomainSplit(Script script) {
String code = script.getIdOrCode();
if (code.contains("domainSplit(") && script.getLang().equals("painless")) {
String modifiedCode = DomainSplitFunction.function + code;
Map<String, Object> modifiedParams = new HashMap<>(script.getParams().size()
+ DomainSplitFunction.params.size());
modifiedParams.putAll(script.getParams());
modifiedParams.putAll(DomainSplitFunction.params);
return new Script(script.getType(), script.getLang(), modifiedCode, modifiedParams);
}
return script;
}
private InputStream processSearchResponse(SearchResponse searchResponse) throws IOException { private InputStream processSearchResponse(SearchResponse searchResponse) throws IOException {
if (searchResponse.getFailedShards() > 0 && searchHasShardFailure == false) { if (searchResponse.getFailedShards() > 0 && searchHasShardFailure == false) {

View File

@ -5,25 +5,33 @@
*/ */
package org.elasticsearch.xpack.ml.utils; package org.elasticsearch.xpack.ml.utils;
import org.apache.logging.log4j.LogManager;
import org.elasticsearch.common.io.Streams; import org.elasticsearch.common.io.Streams;
import org.elasticsearch.common.logging.DeprecationLogger;
import java.io.InputStream; import java.io.InputStream;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.StringJoiner;
public final class DomainSplitFunction { public final class DomainSplitFunction {
public static final String function; private static final DeprecationLogger DEPRECATION_LOGGER =
public static final Map<String, Object> params; new DeprecationLogger(LogManager.getLogger(DomainSplitFunction.class));
DomainSplitFunction() {} private static final int MAX_DOMAIN_PART_LENGTH = 63;
private static final Map<String, String> exact;
private static final Map<String, String> under;
private static final Map<String, String> excluded;
static { static {
Map<String, Object> paramsMap = new HashMap<>(); Map<String, String> exactMap = new HashMap<>(2048);
Map<String, String> exact = new HashMap<>(2048);
String exactResourceName = "org/elasticsearch/xpack/ml/transforms/exact.properties"; String exactResourceName = "org/elasticsearch/xpack/ml/transforms/exact.properties";
@ -31,253 +39,205 @@ public final class DomainSplitFunction {
List<String> lines = Streams.readAllLines(resource); List<String> lines = Streams.readAllLines(resource);
for (String line : lines) { for (String line : lines) {
String[] split = line.split("="); String[] split = line.split("=");
exact.put(split[0].trim(), split[1].trim()); exactMap.put(split[0].trim(), split[1].trim());
} }
} catch (Exception e) { } catch (Exception e) {
throw new RuntimeException("Could not load DomainSplit resource", e); throw new RuntimeException("Could not load DomainSplit resource", e);
} }
exact = Collections.unmodifiableMap(exact); exact = Collections.unmodifiableMap(exactMap);
Map<String, Object> under = new HashMap<>(30); Map<String, String> underMap = new HashMap<>(30);
under.put("bd", "i"); underMap.put("bd", "i");
under.put("np", "i"); underMap.put("np", "i");
under.put("jm", "i"); underMap.put("jm", "i");
under.put("fj", "i"); underMap.put("fj", "i");
under.put("fk", "i"); underMap.put("fk", "i");
under.put("ye", "i"); underMap.put("ye", "i");
under.put("sch.uk", "i"); underMap.put("sch.uk", "i");
under.put("bn", "i"); underMap.put("bn", "i");
under.put("kitakyushu.jp", "i"); underMap.put("kitakyushu.jp", "i");
under.put("kobe.jp", "i"); underMap.put("kobe.jp", "i");
under.put("ke", "i"); underMap.put("ke", "i");
under.put("sapporo.jp", "i"); underMap.put("sapporo.jp", "i");
under.put("kh", "i"); underMap.put("kh", "i");
under.put("mm", "i"); underMap.put("mm", "i");
under.put("il", "i"); underMap.put("il", "i");
under.put("yokohama.jp", "i"); underMap.put("yokohama.jp", "i");
under.put("ck", "i"); underMap.put("ck", "i");
under.put("nagoya.jp", "i"); underMap.put("nagoya.jp", "i");
under.put("sendai.jp", "i"); underMap.put("sendai.jp", "i");
under.put("kw", "i"); underMap.put("kw", "i");
under.put("er", "i"); underMap.put("er", "i");
under.put("mz", "i"); underMap.put("mz", "i");
under.put("platform.sh", "p"); underMap.put("platform.sh", "p");
under.put("gu", "i"); underMap.put("gu", "i");
under.put("nom.br", "i"); underMap.put("nom.br", "i");
under.put("zm", "i"); underMap.put("zm", "i");
under.put("pg", "i"); underMap.put("pg", "i");
under.put("ni", "i"); underMap.put("ni", "i");
under.put("kawasaki.jp", "i"); underMap.put("kawasaki.jp", "i");
under.put("zw", "i"); underMap.put("zw", "i");
under = Collections.unmodifiableMap(under); under = Collections.unmodifiableMap(underMap);
Map<String, String> excluded = new HashMap<>(9); Map<String, String> excludedMap = new HashMap<>(9);
excluded.put("city.yokohama.jp", "i"); excludedMap.put("city.yokohama.jp", "i");
excluded.put("teledata.mz", "i"); excludedMap.put("teledata.mz", "i");
excluded.put("city.kobe.jp", "i"); excludedMap.put("city.kobe.jp", "i");
excluded.put("city.sapporo.jp", "i"); excludedMap.put("city.sapporo.jp", "i");
excluded.put("city.kawasaki.jp", "i"); excludedMap.put("city.kawasaki.jp", "i");
excluded.put("city.nagoya.jp", "i"); excludedMap.put("city.nagoya.jp", "i");
excluded.put("www.ck", "i"); excludedMap.put("www.ck", "i");
excluded.put("city.sendai.jp", "i"); excludedMap.put("city.sendai.jp", "i");
excluded.put("city.kitakyushu.jp", "i"); excludedMap.put("city.kitakyushu.jp", "i");
excluded = Collections.unmodifiableMap(excluded); excluded = Collections.unmodifiableMap(excludedMap);
paramsMap.put("excluded", excluded);
paramsMap.put("under", under);
paramsMap.put("exact", exact);
params = Collections.unmodifiableMap(paramsMap);
} }
static { private DomainSplitFunction() {}
String fn = "String replaceDots(String input) {\n" +
" String output = input;\n" + private static String replaceDots(String input) {
" if (output.indexOf('。') >= 0) {\n" + String output = input;
" output = output.replace('。', '.');\n" + if (output.indexOf('。') >= 0) {
" }\n" + output = output.replace('。', '.');
" if (output.indexOf('') >= 0) {\n" + }
" output = output.replace('', '.');\n" + if (output.indexOf('') >= 0) {
" }\n" + output = output.replace('', '.');
" if (output.indexOf('。') >= 0) {\n" + }
" output = output.replace('。', '.');\n" + if (output.indexOf('。') >= 0) {
" }\n" + output = output.replace('。', '.');
" return output;\n" + }
"}\n" + return output;
"List split(String value) {\n" + }
" int nextWord = 0;\n" +
" List splits = [];\n" + private static List<String> splitDomain(String domain) {
" for(int i = 0; i < value.length(); i++) {\n" + String dotDomain = replaceDots(domain);
" if(value.charAt(i) == (char)'.') {\n" + return Arrays.asList(dotDomain.split("\\."));
" splits.add(value.substring(nextWord, i));\n" + }
" nextWord = i+1;\n" +
" }\n" + private static int findPublicSuffix(List<String> parts) {
" }\n" + int partsSize = parts.size();
" if (nextWord != value.length()) {\n" + for (int i = 0; i < partsSize; i++) {
" splits.add(value.substring(nextWord, value.length()));\n" + StringJoiner joiner = new StringJoiner(".");
" }\n" + for (String s : parts.subList(i, partsSize)) {
" return splits;\n" + joiner.add(s);
"}\n" + }
"List splitDomain(String domain) {\n" + /* parts.subList(i, partsSize).each(joiner::add); */
" String dotDomain = replaceDots(domain);\n" + String ancestorName = joiner.toString();
" return split(dotDomain);\n" + if (exact.containsKey(ancestorName)) {
"}\n" + return i;
"boolean validateSyntax(List parts) {\n" + }
" int lastIndex = parts.length - 1;\n" + /* Excluded domains (e.g. !nhs.uk) use the next highest
" /* Validate the last part specially, as it has different syntax rules. */\n" + domain as the effective public suffix (e.g. uk). */
" if (!validatePart(parts[lastIndex], true)) {\n" + if (excluded.containsKey(ancestorName)) {
" return false;\n" + return i + 1;
" }\n" + }
" for (int i = 0; i < lastIndex; i++) {\n" + String [] pieces = ancestorName.split("\\.");
" String part = parts[i];\n" + if (pieces.length >= 2 && under.containsKey(pieces[1])) {
" if (!validatePart(part, false)) {\n" + return i;
" return false;\n" + }
" }\n" + }
" }\n" + return -1;
" return true;\n" + }
"}\n" +
"boolean validatePart(String part, boolean isFinalPart) {\n" + private static String ancestor(List<String> parts, int levels) {
" int MAX_DOMAIN_PART_LENGTH = 63;\n" + StringJoiner joiner = new StringJoiner(".");
" if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {\n" + for (String s : parts.subList(levels, parts.size())) {
" return false;\n" + joiner.add(s);
" }\n" + }
" int offset = 0;\n" + String name = joiner.toString();
" int strLen = part.length();\n" + if (name.endsWith(".")) {
" while (offset < strLen) {\n" + name = name.substring(0, name.length() - 1);
" int curChar = part.charAt(offset);\n" + }
" offset += 1;\n" + return name;
" if (!(Character.isLetterOrDigit(curChar) || curChar == (char)'-' || curChar == (char)'_')) {\n" + }
" return false;\n" +
" }\n" + private static String topPrivateDomain(String name, List<String> parts, int publicSuffixIndex) {
" }\n" + if (publicSuffixIndex == 1) {
" if (part.charAt(0) == (char)'-' || part.charAt(0) == (char)'_' ||\n" + return name;
" part.charAt(part.length() - 1) == (char)'-' || part.charAt(part.length() - 1) == (char)'_') {\n" + }
" return false;\n" + if (!(publicSuffixIndex > 0)) {
" }\n" + throw new IllegalArgumentException("Not under a public suffix: " + name);
" if (isFinalPart && Character.isDigit(part.charAt(0))) {\n" + }
" return false;\n" + return ancestor(parts, publicSuffixIndex - 1);
" }\n" + }
" return true;\n" +
"}\n" + public static List<String> domainSplit(String host, Map<String, Object> params) {
"int findPublicSuffix(Map params, List parts) {\n" + // NOTE: we don't check SpecialPermission because this will be called (indirectly) from scripts
" int partsSize = parts.size();\n" + AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
"\n" + DEPRECATION_LOGGER.deprecatedAndMaybeLog("domainSplit",
" for (int i = 0; i < partsSize; i++) {\n" + "Method [domainSplit] taking params is deprecated. Remove the params argument.");
" StringJoiner joiner = new StringJoiner('.');\n" + return null;
" for (String s : parts.subList(i, partsSize)) {\n" + });
" joiner.add(s);\n" + return domainSplit(host);
" }\n" + }
" /* parts.subList(i, partsSize).each(joiner::add); */\n" +
" String ancestorName = joiner.toString();\n" + /**
"\n" + * Split {@code host} into sub domain and highest registered domain.
" if (params['exact'].containsKey(ancestorName)) {\n" + * The result is a list containing exactly 2 items the first is the sub domain
" return i;\n" + * and the second the highest registered domain.
" }\n" + *
"\n" + * @param host The hostname to split
" /* Excluded domains (e.g. !nhs.uk) use the next highest\n" + * @return The sub domain and highest registered domain
" domain as the effective public suffix (e.g. uk). */\n" + */
"\n" + public static List<String> domainSplit(String host) {
" if (params['excluded'].containsKey(ancestorName)) {\n" + host = host.trim();
" return i + 1;\n" + if (host.contains(":")) {
" }\n" + return Arrays.asList("", host);
"\n" + }
" List pieces = split(ancestorName);\n" + boolean tentativeIP = true;
" if (pieces.length >= 2 && params['under'].containsKey(pieces[1])) {\n" + for(int i = 0; i < host.length(); i++) {
" return i;\n" + if (!(Character.isDigit(host.charAt(i)) || host.charAt(i) == '.')) {
" }\n" + tentativeIP = false;
" }\n" + break;
"\n" + }
" return -1;\n" + }
"}\n" + if (tentativeIP) {
"String ancestor(List parts, int levels) {\n" + /* special-snowflake rules now... */
" StringJoiner joiner = new StringJoiner('.');\n" + if (host.equals(".")) {
" for (String s : parts.subList(levels, parts.size())) {\n" + return Arrays.asList("","");
" joiner.add(s);\n" + }
" }\n" + return Arrays.asList("", host);
" String name = joiner.toString();\n" + }
" if (name.endsWith('.')) {\n" + String normalizedHost = host;
" name = name.substring(0, name.length() - 1);\n" + normalizedHost = normalizedHost.toLowerCase(Locale.ROOT);
" }\n" + List<String> parts = splitDomain(normalizedHost);
" return name;\n" + int publicSuffixIndex = findPublicSuffix(parts);
"}\n" + if (publicSuffixIndex == 0) {
"String topPrivateDomain(String name, List parts, int publicSuffixIndex) {\n" + return Arrays.asList("", host);
" if (publicSuffixIndex == 1) {\n" + }
" return name;\n" + String highestRegistered = "";
" }\n" + /* for the case where the host is internal like .local so is not a recognised public suffix */
" if (!(publicSuffixIndex > 0)) {\n" + if (publicSuffixIndex == -1) {
" throw new IllegalArgumentException('Not under a public suffix: ' + name);\n" + if (!parts.isEmpty()) {
" }\n" + if (parts.size() == 1) {
" return ancestor(parts, publicSuffixIndex - 1);\n" + return Arrays.asList("", host);
"}\n" + }
"List domainSplit(String host, Map params) {\n" + if (parts.size() > 2) {
" int MAX_DNS_NAME_LENGTH = 253;\n" + boolean allNumeric = true;
" int MAX_LENGTH = 253;\n" + String value = parts.get(parts.size() - 1);
" int MAX_PARTS = 127;\n" + for (int i = 0; i < value.length(); i++) {
" if ('host'.isEmpty()) {\n" + if (!Character.isDigit(value.charAt(i))) {
" return ['',''];\n" + allNumeric = false;
" }\n" + break;
" host = host.trim();\n" + }
" if (host.contains(':')) {\n" + }
" return ['', host];\n" + if (allNumeric) {
" }\n" + highestRegistered = parts.get(parts.size() - 2) + '.' + parts.get(parts.size() - 1);
" boolean tentativeIP = true;\n" + } else {
" for(int i = 0; i < host.length(); i++) {\n" + highestRegistered = parts.get(parts.size() - 1);
" if (!(Character.isDigit(host.charAt(i)) || host.charAt(i) == (char)'.')) {\n" + }
" tentativeIP = false;\n" +
" break;\n" + } else {
" }\n" + highestRegistered = parts.get(parts.size() - 1);
" }\n" + }
" if (tentativeIP) {\n" + }
" /* special-snowflake rules now... */\n" + } else {
" if (host == '.') {\n" + /* HRD is the top private domain */
" return ['',''];\n" + highestRegistered = topPrivateDomain(normalizedHost, parts, publicSuffixIndex);
" }\n" + }
" return ['', host];\n" + String subDomain = host.substring(0, host.length() - highestRegistered.length());
" }\n" + if (subDomain.endsWith(".")) {
" def normalizedHost = host;\n" + subDomain = subDomain.substring(0, subDomain.length() - 1);
" normalizedHost = normalizedHost.toLowerCase();\n" + }
" List parts = splitDomain(normalizedHost);\n" + return Arrays.asList(subDomain, highestRegistered);
" int publicSuffixIndex = findPublicSuffix(params, parts);\n" +
" if (publicSuffixIndex == 0) {\n" +
" return ['', host];\n" +
" }\n" +
" String highestRegistered = '';\n" +
" /* for the case where the host is internal like .local so is not a recognised public suffix */\n" +
" if (publicSuffixIndex == -1) {\n" +
" if (!parts.isEmpty()) {\n" +
" if (parts.size() == 1) {\n" +
" return ['', host];\n" +
" }\n" +
" if (parts.size() > 2) {\n" +
" boolean allNumeric = true;\n" +
" String value = parts.get(parts.size() - 1);\n" +
" for (int i = 0; i < value.length(); i++) {\n" +
" if (!Character.isDigit(value.charAt(i))) {\n" +
" allNumeric = false;\n" +
" break;\n" +
" }\n" +
" }\n" +
" if (allNumeric) {\n" +
" highestRegistered = parts.get(parts.size() - 2) + '.' + parts.get(parts.size() - 1);\n" +
" } else {\n" +
" highestRegistered = parts.get(parts.size() - 1);\n" +
" }\n" +
"\n" +
" } else {\n" +
" highestRegistered = parts.get(parts.size() - 1);\n" +
" }\n" +
" }\n" +
" } else {\n" +
" /* HRD is the top private domain */\n" +
" highestRegistered = topPrivateDomain(normalizedHost, parts, publicSuffixIndex);\n" +
" }\n" +
" String subDomain = host.substring(0, host.length() - highestRegistered.length());\n" +
" if (subDomain.endsWith('.')) {\n" +
" subDomain = subDomain.substring(0, subDomain.length() - 1);\n" +
" }\n" +
" return [subDomain, highestRegistered];\n" +
"}\n";
fn = fn.replace("\n"," ");
function = fn;
} }
} }

View File

@ -0,0 +1 @@
org.elasticsearch.xpack.ml.MachineLearningPainlessExtension

View File

@ -0,0 +1,5 @@
# Static imports that make domainSplit callable directly from scripts.
# Both overloads are implemented by DomainSplitFunction: one takes only the
# host string, the other additionally takes a Map argument.
static_import {
List domainSplit(String) from_class org.elasticsearch.xpack.ml.utils.DomainSplitFunction
List domainSplit(String,Map) from_class org.elasticsearch.xpack.ml.utils.DomainSplitFunction
}

View File

@ -454,8 +454,6 @@ public class ScrollDataExtractorTests extends ESTestCase {
// Check for the scripts // Check for the scripts
assertThat(searchRequest, containsString("{\"script\":{\"source\":\"return 1 + 1;\",\"lang\":\"mockscript\"}" assertThat(searchRequest, containsString("{\"script\":{\"source\":\"return 1 + 1;\",\"lang\":\"mockscript\"}"
.replaceAll("\\s", ""))); .replaceAll("\\s", "")));
assertThat(searchRequest, containsString("List domainSplit(String host, Map params)".replaceAll("\\s", "")));
assertThat(searchRequest, containsString("String replaceDots(String input) {".replaceAll("\\s", "")));
assertThat(capturedContinueScrollIds.size(), equalTo(1)); assertThat(capturedContinueScrollIds.size(), equalTo(1));
assertThat(capturedContinueScrollIds.get(0), equalTo(response1.getScrollId())); assertThat(capturedContinueScrollIds.get(0), equalTo(response1.getScrollId()));

View File

@ -0,0 +1,56 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.utils;
import org.elasticsearch.test.ESTestCase;
import java.util.List;
/**
 * Unit tests for {@link DomainSplitFunction#domainSplit(String)}.
 */
public class DomainSplitFunctionTests extends ESTestCase {

    /**
     * Runs every host name in the table through {@code domainSplit} and checks
     * both elements of the result.
     */
    public void testDomainSplit() {
        // Test cases from https://github.com/john-kurkowski/tldextract/tree/master/tldextract/tests
        // Each row is {expected sub domain, expected highest registered domain, host name}.
        String[][] cases = {
            {"www", "google.com", "www.google.com"},
            {"www.maps", "google.co.uk", "www.maps.google.co.uk"},
            {"www", "theregister.co.uk", "www.theregister.co.uk"},
            {"", "gmail.com", "gmail.com"},
            {"media.forums", "theregister.co.uk", "media.forums.theregister.co.uk"},
            {"www", "www.com", "www.www.com"},
            {"", "www.com", "www.com"},
            {"", "internalunlikelyhostname", "internalunlikelyhostname"},
            {"internalunlikelyhostname", "bizarre", "internalunlikelyhostname.bizarre"},
            {"", "internalunlikelyhostname.info", "internalunlikelyhostname.info"}, // .info is a valid TLD
            {"internalunlikelyhostname", "information", "internalunlikelyhostname.information"},
            {"", "216.22.0.192", "216.22.0.192"},
            {"", "::1", "::1"},
            {"", "FE80:0000:0000:0000:0202:B3FF:FE1E:8329", "FE80:0000:0000:0000:0202:B3FF:FE1E:8329"},
            {"216.22", "project.coop", "216.22.project.coop"},
            {"www", "xn--h1alffa9f.xn--p1ai", "www.xn--h1alffa9f.xn--p1ai"},
            {"", "", ""},
            {"www", "parliament.uk", "www.parliament.uk"},
            {"www", "parliament.co.uk", "www.parliament.co.uk"},
            {"www.a", "cgs.act.edu.au", "www.a.cgs.act.edu.au"},
            {"www", "google.com.au", "www.google.com.au"},
            {"www", "metp.net.cn", "www.metp.net.cn"},
            {"www", "waiterrant.blogspot.com", "www.waiterrant.blogspot.com"},
            {"", "kittens.blogspot.co.uk", "kittens.blogspot.co.uk"},
            {"example", "example", "example.example"},
            {"b.example", "example", "b.example.example"},
            {"a.b.example", "example", "a.b.example.example"},
            {"example", "local", "example.local"},
            {"b.example", "local", "b.example.local"},
            {"a.b.example", "local", "a.b.example.local"},
            {"r192494180984795-1-1041782-channel-live.ums", "ustream.tv",
                "r192494180984795-1-1041782-channel-live.ums.ustream.tv"},
        };
        for (String[] row : cases) {
            assertDomainSplit(row[0], row[1], row[2]);
        }
    }

    /**
     * Splits {@code hostName} and asserts the first element equals
     * {@code expectedSubDomain} and the second equals {@code expectedDomain}.
     */
    private void assertDomainSplit(String expectedSubDomain, String expectedDomain, String hostName) {
        List<String> result = DomainSplitFunction.domainSplit(hostName);
        assertEquals(expectedSubDomain, result.get(0));
        assertEquals(expectedDomain, result.get(1));
    }
}