From 8ab7ca5284581d59b3988024894a934504371092 Mon Sep 17 00:00:00 2001
From: Adrien Grand <jpountz@gmail.com>
Date: Mon, 10 Oct 2016 09:32:26 +0200
Subject: [PATCH] Source filtering should treat dots in field names as sub
 objects. (#20736)

Mappings treat dots in field names as sub objects, for instance

```
{
  "a.b": "c"
}
```

generates the same dynamic mappings as

```
{
  "a": {
    "b": "c"
  }
}
```

Source filtering should be consistent with this behaviour so that an include
list containing `a` should include fields whose name is `a.b`.

To make this change easier, source filtering was refactored to use automata.
The ability to treat dots in field names as sub objects is provided by the
`makeMatchDotsInFieldNames` method of `XContentMapValues`.

Closes #20719
---
 .../org/elasticsearch/common/regex/Regex.java |  32 +++
 .../xcontent/support/XContentMapValues.java   | 239 +++++++++++-------
 .../support/XContentMapValuesTests.java       |  39 +++
 3 files changed, 221 insertions(+), 89 deletions(-)
diff --git a/core/src/main/java/org/elasticsearch/common/regex/Regex.java b/core/src/main/java/org/elasticsearch/common/regex/Regex.java
index 061ad6c26c0..f1f945288e4 100644
--- a/core/src/main/java/org/elasticsearch/common/regex/Regex.java
+++ b/core/src/main/java/org/elasticsearch/common/regex/Regex.java
@@ -19,8 +19,13 @@
 
 package org.elasticsearch.common.regex;
 
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.Operations;
 import org.elasticsearch.common.Strings;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Locale;
 import java.util.regex.Pattern;
 
@@ -46,6 +51,33 @@ public class Regex {
         return str.equals("*");
     }
 
+    /** Return an {@link Automaton} that matches the given pattern. */
+    public static Automaton simpleMatchToAutomaton(String pattern) {
+        List<Automaton> automata = new ArrayList<>();
+        int previous = 0;
+        for (int i = pattern.indexOf('*'); i != -1; i = pattern.indexOf('*', i + 1)) {
+            automata.add(Automata.makeString(pattern.substring(previous, i)));
+            automata.add(Automata.makeAnyString());
+            previous = i + 1;
+        }
+        automata.add(Automata.makeString(pattern.substring(previous)));
+        return Operations.concatenate(automata);
+    }
+
+    /**
+     * Return an Automaton that matches the union of the provided patterns.
+     */
+    public static Automaton simpleMatchToAutomaton(String... patterns) {
+        if (patterns.length < 1) {
+            throw new IllegalArgumentException("There must be at least one pattern, zero given");
+        }
+        List<Automaton> automata = new ArrayList<>();
+        for (String pattern : patterns) {
+            automata.add(simpleMatchToAutomaton(pattern));
+        }
+        return Operations.union(automata);
+    }
+
     /**
      * Match a String against the given pattern, supporting the following simple
      * pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an
diff --git a/core/src/main/java/org/elasticsearch/common/xcontent/support/XContentMapValues.java b/core/src/main/java/org/elasticsearch/common/xcontent/support/XContentMapValues.java
index a8c120f424b..fa211bb08e4 100644
--- a/core/src/main/java/org/elasticsearch/common/xcontent/support/XContentMapValues.java
+++ b/core/src/main/java/org/elasticsearch/common/xcontent/support/XContentMapValues.java
@@ -19,12 +19,17 @@
 
 package org.elasticsearch.common.xcontent.support;
 
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.Operations;
 import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.common.unit.TimeValue;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -134,115 +139,171 @@ public class XContentMapValues {
         return null;
     }
 
-    public static Map<String, Object> filter(Map<String, Object> map, String[] includes, String[] excludes) {
-        Map<String, Object> result = new HashMap<>();
-        filter(map, result, includes == null ? Strings.EMPTY_ARRAY : includes, excludes == null ? Strings.EMPTY_ARRAY : excludes, new StringBuilder());
-        return result;
+    /**
+     * Only keep properties in {@code map} that match the {@code includes} but
+     * not the {@code excludes}. An empty list of includes is interpreted as a
+     * wildcard while an empty list of excludes does not match anything.
+     *
+     * If a property matches both an include and an exclude, then the exclude
+     * wins.
+     *
+     * If an object matches, then any of its sub properties are automatically
+     * considered as matching as well, both for includes and excludes.
+     *
+     * Dots in field names are treated as sub objects. So for instance if a
+     * document contains {@code a.b} as a property and {@code a} is an include,
+     * then {@code a.b} will be kept in the filtered map.
+     */
+    public static Map<String, Object> filter(Map<String, ?> map, String[] includes, String[] excludes) {
+        CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString());
+
+        CharacterRunAutomaton include;
+        if (includes == null || includes.length == 0) {
+            include = matchAllAutomaton;
+        } else {
+            Automaton includeA = Regex.simpleMatchToAutomaton(includes);
+            includeA = makeMatchDotsInFieldNames(includeA);
+            include = new CharacterRunAutomaton(includeA);
+        }
+
+        Automaton excludeA;
+        if (excludes == null || excludes.length == 0) {
+            excludeA = Automata.makeEmpty();
+        } else {
+            excludeA = Regex.simpleMatchToAutomaton(excludes);
+            excludeA = makeMatchDotsInFieldNames(excludeA);
+        }
+        CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA);
+
+        // NOTE: We cannot use Operations.minus because of the special case that
+        // we want all sub properties to match as soon as an object matches
+
+        return filter(map,
+                include, include.getInitialState(),
+                exclude, exclude.getInitialState(),
+                matchAllAutomaton);
     }
 
-    private static void filter(Map<String, Object> map, Map<String, Object> into, String[] includes, String[] excludes, StringBuilder sb) {
-        if (includes.length == 0 && excludes.length == 0) {
-            into.putAll(map);
-            return;
+    /** Make matches on objects also match dots in field names.
+     *  For instance, if the original simple regex is `foo`, this will translate
+     *  it into `foo` OR `foo.*`. */
+    private static Automaton makeMatchDotsInFieldNames(Automaton automaton) {
+        return Operations.union(
+                automaton,
+                Operations.concatenate(Arrays.asList(automaton, Automata.makeChar('.'), Automata.makeAnyString())));
+    }
+
+    private static int step(CharacterRunAutomaton automaton, String key, int state) {
+        for (int i = 0; state != -1 && i < key.length(); ++i) {
+            state = automaton.step(state, key.charAt(i));
         }
-        for (Map.Entry<String, Object> entry : map.entrySet()) {
+        return state;
+    }
+
+    private static Map<String, Object> filter(Map<String, ?> map,
+            CharacterRunAutomaton includeAutomaton, int initialIncludeState,
+            CharacterRunAutomaton excludeAutomaton, int initialExcludeState,
+            CharacterRunAutomaton matchAllAutomaton) {
+        Map<String, Object> filtered = new HashMap<>();
+        for (Map.Entry<String, ?> entry : map.entrySet()) {
             String key = entry.getKey();
-            int mark = sb.length();
-            if (sb.length() > 0) {
-                sb.append('.');
-            }
-            sb.append(key);
-            String path = sb.toString();
 
-            if (Regex.simpleMatch(excludes, path)) {
-                sb.setLength(mark);
+            int includeState = step(includeAutomaton, key, initialIncludeState);
+            if (includeState == -1) {
                 continue;
             }
 
-            boolean exactIncludeMatch = false; // true if the current position was specifically mentioned
-            boolean pathIsPrefixOfAnInclude = false; // true if potentially a sub scope can be included
-            if (includes.length == 0) {
-                // implied match anything
-                exactIncludeMatch = true;
+            int excludeState = step(excludeAutomaton, key, initialExcludeState);
+            if (excludeState != -1 && excludeAutomaton.isAccept(excludeState)) {
+                continue;
+            }
+
+            Object value = entry.getValue();
+
+            CharacterRunAutomaton subIncludeAutomaton = includeAutomaton;
+            int subIncludeState = includeState;
+            if (includeAutomaton.isAccept(includeState)) {
+                if (excludeState == -1 || excludeAutomaton.step(excludeState, '.') == -1) {
+                    // the exclude has no chances to match inner properties
+                    filtered.put(key, value);
+                    continue;
+                } else {
+                    // the object matched, so consider that the include matches every inner property
+                    // we only care about excludes now
+                    subIncludeAutomaton = matchAllAutomaton;
+                    subIncludeState = includeAutomaton.getInitialState();
+                }
+            }
+
+            if (value instanceof Map) {
+
+                subIncludeState = subIncludeAutomaton.step(subIncludeState, '.');
+                if (subIncludeState == -1) {
+                    continue;
+                }
+                if (excludeState != -1) {
+                    excludeState = excludeAutomaton.step(excludeState, '.');
+                }
+
+                Map<String, Object> valueAsMap = (Map<String, Object>) value;
+                Map<String, Object> filteredValue = filter(valueAsMap,
+                        subIncludeAutomaton, subIncludeState, excludeAutomaton, excludeState, matchAllAutomaton);
+                if (includeAutomaton.isAccept(includeState) || filteredValue.isEmpty() == false) {
+                    filtered.put(key, filteredValue);
+                }
+
+            } else if (value instanceof Iterable) {
+
+                List<Object> filteredValue = filter((Iterable<?>) value,
+                        subIncludeAutomaton, subIncludeState, excludeAutomaton, excludeState, matchAllAutomaton);
+                if (includeAutomaton.isAccept(includeState) || filteredValue.isEmpty() == false) {
+                    filtered.put(key, filteredValue);
+                }
+
             } else {
-                for (String include : includes) {
-                    // check for prefix matches as well to see if we need to zero in, something like: obj1.arr1.* or *.field
-                    // note, this does not work well with middle matches, like obj1.*.obj3
-                    if (include.charAt(0) == '*') {
-                        if (Regex.simpleMatch(include, path)) {
-                            exactIncludeMatch = true;
-                            break;
-                        }
-                        pathIsPrefixOfAnInclude = true;
-                        continue;
-                    }
-                    if (include.startsWith(path)) {
-                        if (include.length() == path.length()) {
-                            exactIncludeMatch = true;
-                            break;
-                        } else if (include.length() > path.length() && include.charAt(path.length()) == '.') {
-                            // include might may match deeper paths. Dive deeper.
-                            pathIsPrefixOfAnInclude = true;
-                            continue;
-                        }
-                    }
-                    if (Regex.simpleMatch(include, path)) {
-                        exactIncludeMatch = true;
-                        break;
-                    }
+
+                // leaf property
+                if (includeAutomaton.isAccept(includeState)
+                        && (excludeState == -1 || excludeAutomaton.isAccept(excludeState) == false)) {
+                    filtered.put(key, value);
                 }
+
             }
 
-            if (!(pathIsPrefixOfAnInclude || exactIncludeMatch)) {
-                // skip subkeys, not interesting.
-                sb.setLength(mark);
-                continue;
-            }
-
-
-            if (entry.getValue() instanceof Map) {
-                Map<String, Object> innerInto = new HashMap<>();
-                // if we had an exact match, we want give deeper excludes their chance
-                filter((Map<String, Object>) entry.getValue(), innerInto, exactIncludeMatch ? Strings.EMPTY_ARRAY : includes, excludes, sb);
-                if (exactIncludeMatch || !innerInto.isEmpty()) {
-                    into.put(entry.getKey(), innerInto);
-                }
-            } else if (entry.getValue() instanceof List) {
-                List<Object> list = (List<Object>) entry.getValue();
-                List<Object> innerInto = new ArrayList<>(list.size());
-                // if we had an exact match, we want give deeper excludes their chance
-                filter(list, innerInto, exactIncludeMatch ? Strings.EMPTY_ARRAY : includes, excludes, sb);
-                into.put(entry.getKey(), innerInto);
-            } else if (exactIncludeMatch) {
-                into.put(entry.getKey(), entry.getValue());
-            }
-            sb.setLength(mark);
         }
+        return filtered;
     }
 
-    private static void filter(List<Object> from, List<Object> to, String[] includes, String[] excludes, StringBuilder sb) {
-        if (includes.length == 0 && excludes.length == 0) {
-            to.addAll(from);
-            return;
-        }
-
-        for (Object o : from) {
-            if (o instanceof Map) {
-                Map<String, Object> innerInto = new HashMap<>();
-                filter((Map<String, Object>) o, innerInto, includes, excludes, sb);
-                if (!innerInto.isEmpty()) {
-                    to.add(innerInto);
+    private static List<Object> filter(Iterable<?> iterable,
+            CharacterRunAutomaton includeAutomaton, int initialIncludeState,
+            CharacterRunAutomaton excludeAutomaton, int initialExcludeState,
+            CharacterRunAutomaton matchAllAutomaton) {
+        List<Object> filtered = new ArrayList<>();
+        for (Object value : iterable) {
+            if (value instanceof Map) {
+                int includeState = includeAutomaton.step(initialIncludeState, '.');
+                int excludeState = initialExcludeState;
+                if (excludeState != -1) {
+                    excludeState = excludeAutomaton.step(excludeState, '.');
                 }
-            } else if (o instanceof List) {
-                List<Object> innerInto = new ArrayList<>();
-                filter((List<Object>) o, innerInto, includes, excludes, sb);
-                if (!innerInto.isEmpty()) {
-                    to.add(innerInto);
+                Map<String, Object> filteredValue = filter((Map<String, ?>)value,
+                        includeAutomaton, includeState, excludeAutomaton, excludeState, matchAllAutomaton);
+                if (filteredValue.isEmpty() == false) {
+                    filtered.add(filteredValue);
+                }
+            } else if (value instanceof Iterable) {
+                List<Object> filteredValue = filter((Iterable<?>) value,
+                        includeAutomaton, initialIncludeState, excludeAutomaton, initialExcludeState, matchAllAutomaton);
+                if (filteredValue.isEmpty() == false) {
+                    filtered.add(filteredValue);
                 }
             } else {
-                to.add(o);
+                // TODO: we have tests relying on this behavior on arrays even
+                // if the path does not match, but this looks like a bug?
+                filtered.add(value);
             }
         }
+        return filtered;
     }
 
     public static boolean isObject(Object node) {
diff --git a/core/src/test/java/org/elasticsearch/common/xcontent/support/XContentMapValuesTests.java b/core/src/test/java/org/elasticsearch/common/xcontent/support/XContentMapValuesTests.java
index ba2043bbe20..d7e363b3274 100644
--- a/core/src/test/java/org/elasticsearch/common/xcontent/support/XContentMapValuesTests.java
+++ b/core/src/test/java/org/elasticsearch/common/xcontent/support/XContentMapValuesTests.java
@@ -551,4 +551,43 @@ public class XContentMapValuesTests extends ESTestCase {
                     parser.list());
         }
     }
+
+    public void testDotsInFieldNames() {
+        Map<String, Object> map = new HashMap<>();
+        map.put("foo.bar", 2);
+        Map<String, Object> sub = new HashMap<>();
+        sub.put("baz", 3);
+        map.put("foo", sub);
+        map.put("quux", 5);
+
+        // dots in field names in includes
+        Map<String, Object> filtered = XContentMapValues.filter(map, new String[] {"foo"}, new String[0]);
+        Map<String, Object> expected = new HashMap<>(map);
+        expected.remove("quux");
+        assertEquals(expected, filtered);
+
+        // dots in field names in excludes
+        filtered = XContentMapValues.filter(map, new String[0], new String[] {"foo"});
+        expected = new HashMap<>(map);
+        expected.keySet().retainAll(Collections.singleton("quux"));
+        assertEquals(expected, filtered);
+    }
+
+    public void testSupplementaryCharactersInPaths() {
+        Map<String, Object> map = new HashMap<>();
+        map.put("搜索", 2);
+        map.put("指数", 3);
+
+        assertEquals(Collections.singletonMap("搜索", 2), XContentMapValues.filter(map, new String[] {"搜索"}, new String[0]));
+        assertEquals(Collections.singletonMap("指数", 3), XContentMapValues.filter(map, new String[0], new String[] {"搜索"}));
+    }
+
+    public void testSharedPrefixes() {
+        Map<String, Object> map = new HashMap<>();
+        map.put("foobar", 2);
+        map.put("foobaz", 3);
+
+        assertEquals(Collections.singletonMap("foobar", 2), XContentMapValues.filter(map, new String[] {"foobar"}, new String[0]));
+        assertEquals(Collections.singletonMap("foobaz", 3), XContentMapValues.filter(map, new String[0], new String[] {"foobar"}));
+    }
 }