From 8ab7ca5284581d59b3988024894a934504371092 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Mon, 10 Oct 2016 09:32:26 +0200 Subject: [PATCH] Source filtering should treat dots in field names as sub objects. (#20736) Mappings treat dots in field names as sub objects, for instance ``` { "a.b": "c" } ``` generates the same dynamic mappings as ``` { "a": { "b": "c" } } ``` Source filtering should be consistent with this behaviour so that an include list containing `a` should include fields whose name is `a.b`. To make this change easier, source filtering was refactored to use automata. The ability to treat dots in field names as sub objects is provided by the `makeMatchDotsInFieldNames` method of `XContentMapValues`. Closes #20719 --- .../org/elasticsearch/common/regex/Regex.java | 32 +++ .../xcontent/support/XContentMapValues.java | 239 +++++++++++------- .../support/XContentMapValuesTests.java | 39 +++ 3 files changed, 221 insertions(+), 89 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/common/regex/Regex.java b/core/src/main/java/org/elasticsearch/common/regex/Regex.java index 061ad6c26c0..f1f945288e4 100644 --- a/core/src/main/java/org/elasticsearch/common/regex/Regex.java +++ b/core/src/main/java/org/elasticsearch/common/regex/Regex.java @@ -19,8 +19,13 @@ package org.elasticsearch.common.regex; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.Operations; import org.elasticsearch.common.Strings; +import java.util.ArrayList; +import java.util.List; import java.util.Locale; import java.util.regex.Pattern; @@ -46,6 +51,33 @@ public class Regex { return str.equals("*"); } + /** Return an {@link Automaton} that matches the given pattern. */ + public static Automaton simpleMatchToAutomaton(String pattern) { + List automata = new ArrayList<>(); + int previous = 0; + for (int i = pattern.indexOf('*'); i != -1; i = pattern.indexOf('*', i + 1)) { + automata.add(Automata.makeString(pattern.substring(previous, i))); + automata.add(Automata.makeAnyString()); + previous = i + 1; + } + automata.add(Automata.makeString(pattern.substring(previous))); + return Operations.concatenate(automata); + } + + /** + * Return an Automaton that matches the union of the provided patterns. + */ + public static Automaton simpleMatchToAutomaton(String... patterns) { + if (patterns.length < 1) { + throw new IllegalArgumentException("There must be at least one pattern, zero given"); + } + List automata = new ArrayList<>(); + for (String pattern : patterns) { + automata.add(simpleMatchToAutomaton(pattern)); + } + return Operations.union(automata); + } + /** * Match a String against the given pattern, supporting the following simple * pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an diff --git a/core/src/main/java/org/elasticsearch/common/xcontent/support/XContentMapValues.java b/core/src/main/java/org/elasticsearch/common/xcontent/support/XContentMapValues.java index a8c120f424b..fa211bb08e4 100644 --- a/core/src/main/java/org/elasticsearch/common/xcontent/support/XContentMapValues.java +++ b/core/src/main/java/org/elasticsearch/common/xcontent/support/XContentMapValues.java @@ -19,12 +19,17 @@ package org.elasticsearch.common.xcontent.support; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.Operations; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.Strings; import org.elasticsearch.common.regex.Regex; import org.elasticsearch.common.unit.TimeValue; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -134,115 +139,171 @@ public class XContentMapValues { return null; } - public static Map filter(Map map, String[] includes, String[] excludes) { - Map result = new HashMap<>(); - filter(map, result, includes == null ? Strings.EMPTY_ARRAY : includes, excludes == null ? Strings.EMPTY_ARRAY : excludes, new StringBuilder()); - return result; + /** + * Only keep properties in {@code map} that match the {@code includes} but + * not the {@code excludes}. An empty list of includes is interpreted as a + * wildcard while an empty list of excludes does not match anything. + * + * If a property matches both an include and an exclude, then the exclude + * wins. + * + * If an object matches, then any of its sub properties are automatically + * considered as matching as well, both for includes and excludes. + * + * Dots in field names are treated as sub objects. So for instance if a + * document contains {@code a.b} as a property and {@code a} is an include, + * then {@code a.b} will be kept in the filtered map. + */ + public static Map filter(Map map, String[] includes, String[] excludes) { + CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString()); + + CharacterRunAutomaton include; + if (includes == null || includes.length == 0) { + include = matchAllAutomaton; + } else { + Automaton includeA = Regex.simpleMatchToAutomaton(includes); + includeA = makeMatchDotsInFieldNames(includeA); + include = new CharacterRunAutomaton(includeA); + } + + Automaton excludeA; + if (excludes == null || excludes.length == 0) { + excludeA = Automata.makeEmpty(); + } else { + excludeA = Regex.simpleMatchToAutomaton(excludes); + excludeA = makeMatchDotsInFieldNames(excludeA); + } + CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA); + + // NOTE: We cannot use Operations.minus because of the special case that + // we want all sub properties to match as soon as an object matches + + return filter(map, + include, include.getInitialState(), + exclude, exclude.getInitialState(), + matchAllAutomaton); } - private static void filter(Map map, Map into, String[] includes, String[] excludes, StringBuilder sb) { - if (includes.length == 0 && excludes.length == 0) { - into.putAll(map); - return; + /** Make matches on objects also match dots in field names. + * For instance, if the original simple regex is `foo`, this will translate + * it into `foo` OR `foo.*`. */ + private static Automaton makeMatchDotsInFieldNames(Automaton automaton) { + return Operations.union( + automaton, + Operations.concatenate(Arrays.asList(automaton, Automata.makeChar('.'), Automata.makeAnyString()))); + } + + private static int step(CharacterRunAutomaton automaton, String key, int state) { + for (int i = 0; state != -1 && i < key.length(); ++i) { + state = automaton.step(state, key.charAt(i)); } - for (Map.Entry entry : map.entrySet()) { + return state; + } + + private static Map filter(Map map, + CharacterRunAutomaton includeAutomaton, int initialIncludeState, + CharacterRunAutomaton excludeAutomaton, int initialExcludeState, + CharacterRunAutomaton matchAllAutomaton) { + Map filtered = new HashMap<>(); + for (Map.Entry entry : map.entrySet()) { String key = entry.getKey(); - int mark = sb.length(); - if (sb.length() > 0) { - sb.append('.'); - } - sb.append(key); - String path = sb.toString(); - if (Regex.simpleMatch(excludes, path)) { - sb.setLength(mark); + int includeState = step(includeAutomaton, key, initialIncludeState); + if (includeState == -1) { continue; } - boolean exactIncludeMatch = false; // true if the current position was specifically mentioned - boolean pathIsPrefixOfAnInclude = false; // true if potentially a sub scope can be included - if (includes.length == 0) { - // implied match anything - exactIncludeMatch = true; + int excludeState = step(excludeAutomaton, key, initialExcludeState); + if (excludeState != -1 && excludeAutomaton.isAccept(excludeState)) { + continue; + } + + Object value = entry.getValue(); + + CharacterRunAutomaton subIncludeAutomaton = includeAutomaton; + int subIncludeState = includeState; + if (includeAutomaton.isAccept(includeState)) { + if (excludeState == -1 || excludeAutomaton.step(excludeState, '.') == -1) { + // the exclude has no chances to match inner properties + filtered.put(key, value); + continue; + } else { + // the object matched, so consider that the include matches every inner property + // we only care about excludes now + subIncludeAutomaton = matchAllAutomaton; + subIncludeState = includeAutomaton.getInitialState(); + } + } + + if (value instanceof Map) { + + subIncludeState = subIncludeAutomaton.step(subIncludeState, '.'); + if (subIncludeState == -1) { + continue; + } + if (excludeState != -1) { + excludeState = excludeAutomaton.step(excludeState, '.'); + } + + Map valueAsMap = (Map) value; + Map filteredValue = filter(valueAsMap, + subIncludeAutomaton, subIncludeState, excludeAutomaton, excludeState, matchAllAutomaton); + if (includeAutomaton.isAccept(includeState) || filteredValue.isEmpty() == false) { + filtered.put(key, filteredValue); + } + + } else if (value instanceof Iterable) { + + List filteredValue = filter((Iterable) value, + subIncludeAutomaton, subIncludeState, excludeAutomaton, excludeState, matchAllAutomaton); + if (includeAutomaton.isAccept(includeState) || filteredValue.isEmpty() == false) { + filtered.put(key, filteredValue); + } + } else { - for (String include : includes) { - // check for prefix matches as well to see if we need to zero in, something like: obj1.arr1.* or *.field - // note, this does not work well with middle matches, like obj1.*.obj3 - if (include.charAt(0) == '*') { - if (Regex.simpleMatch(include, path)) { - exactIncludeMatch = true; - break; - } - pathIsPrefixOfAnInclude = true; - continue; - } - if (include.startsWith(path)) { - if (include.length() == path.length()) { - exactIncludeMatch = true; - break; - } else if (include.length() > path.length() && include.charAt(path.length()) == '.') { - // include might may match deeper paths. Dive deeper. - pathIsPrefixOfAnInclude = true; - continue; - } - } - if (Regex.simpleMatch(include, path)) { - exactIncludeMatch = true; - break; - } + + // leaf property + if (includeAutomaton.isAccept(includeState) + && (excludeState == -1 || excludeAutomaton.isAccept(excludeState) == false)) { + filtered.put(key, value); } + } - if (!(pathIsPrefixOfAnInclude || exactIncludeMatch)) { - // skip subkeys, not interesting. - sb.setLength(mark); - continue; - } - - - if (entry.getValue() instanceof Map) { - Map innerInto = new HashMap<>(); - // if we had an exact match, we want give deeper excludes their chance - filter((Map) entry.getValue(), innerInto, exactIncludeMatch ? Strings.EMPTY_ARRAY : includes, excludes, sb); - if (exactIncludeMatch || !innerInto.isEmpty()) { - into.put(entry.getKey(), innerInto); - } - } else if (entry.getValue() instanceof List) { - List list = (List) entry.getValue(); - List innerInto = new ArrayList<>(list.size()); - // if we had an exact match, we want give deeper excludes their chance - filter(list, innerInto, exactIncludeMatch ? Strings.EMPTY_ARRAY : includes, excludes, sb); - into.put(entry.getKey(), innerInto); - } else if (exactIncludeMatch) { - into.put(entry.getKey(), entry.getValue()); - } - sb.setLength(mark); } + return filtered; } - private static void filter(List from, List to, String[] includes, String[] excludes, StringBuilder sb) { - if (includes.length == 0 && excludes.length == 0) { - to.addAll(from); - return; - } - - for (Object o : from) { - if (o instanceof Map) { - Map innerInto = new HashMap<>(); - filter((Map) o, innerInto, includes, excludes, sb); - if (!innerInto.isEmpty()) { - to.add(innerInto); + private static List filter(Iterable iterable, + CharacterRunAutomaton includeAutomaton, int initialIncludeState, + CharacterRunAutomaton excludeAutomaton, int initialExcludeState, + CharacterRunAutomaton matchAllAutomaton) { + List filtered = new ArrayList<>(); + for (Object value : iterable) { + if (value instanceof Map) { + int includeState = includeAutomaton.step(initialIncludeState, '.'); + int excludeState = initialExcludeState; + if (excludeState != -1) { + excludeState = excludeAutomaton.step(excludeState, '.'); } - } else if (o instanceof List) { - List innerInto = new ArrayList<>(); - filter((List) o, innerInto, includes, excludes, sb); - if (!innerInto.isEmpty()) { - to.add(innerInto); + Map filteredValue = filter((Map)value, + includeAutomaton, includeState, excludeAutomaton, excludeState, matchAllAutomaton); + if (filteredValue.isEmpty() == false) { + filtered.add(filteredValue); + } + } else if (value instanceof Iterable) { + List filteredValue = filter((Iterable) value, + includeAutomaton, initialIncludeState, excludeAutomaton, initialExcludeState, matchAllAutomaton); + if (filteredValue.isEmpty() == false) { + filtered.add(filteredValue); } } else { - to.add(o); + // TODO: we have tests relying on this behavior on arrays even + // if the path does not match, but this looks like a bug? + filtered.add(value); } } + return filtered; } public static boolean isObject(Object node) { diff --git a/core/src/test/java/org/elasticsearch/common/xcontent/support/XContentMapValuesTests.java b/core/src/test/java/org/elasticsearch/common/xcontent/support/XContentMapValuesTests.java index ba2043bbe20..d7e363b3274 100644 --- a/core/src/test/java/org/elasticsearch/common/xcontent/support/XContentMapValuesTests.java +++ b/core/src/test/java/org/elasticsearch/common/xcontent/support/XContentMapValuesTests.java @@ -551,4 +551,43 @@ public class XContentMapValuesTests extends ESTestCase { parser.list()); } } + + public void testDotsInFieldNames() { + Map map = new HashMap<>(); + map.put("foo.bar", 2); + Map sub = new HashMap<>(); + sub.put("baz", 3); + map.put("foo", sub); + map.put("quux", 5); + + // dots in field names in includes + Map filtered = XContentMapValues.filter(map, new String[] {"foo"}, new String[0]); + Map expected = new HashMap<>(map); + expected.remove("quux"); + assertEquals(expected, filtered); + + // dots in field names in excludes + filtered = XContentMapValues.filter(map, new String[0], new String[] {"foo"}); + expected = new HashMap<>(map); + expected.keySet().retainAll(Collections.singleton("quux")); + assertEquals(expected, filtered); + } + + public void testSupplementaryCharactersInPaths() { + Map map = new HashMap<>(); + map.put("搜索", 2); + map.put("指数", 3); + + assertEquals(Collections.singletonMap("搜索", 2), XContentMapValues.filter(map, new String[] {"搜索"}, new String[0])); + assertEquals(Collections.singletonMap("指数", 3), XContentMapValues.filter(map, new String[0], new String[] {"搜索"})); + } + + public void testSharedPrefixes() { + Map map = new HashMap<>(); + map.put("foobar", 2); + map.put("foobaz", 3); + + assertEquals(Collections.singletonMap("foobar", 2), XContentMapValues.filter(map, new String[] {"foobar"}, new String[0])); + assertEquals(Collections.singletonMap("foobaz", 3), XContentMapValues.filter(map, new String[0], new String[] {"foobar"})); + } }