Source filtering should treat dots in field names as sub objects. (#20736)

Mappings treat dots in field names as sub objects, for instance

```
{
  "a.b": "c"
}
```

generates the same dynamic mappings as

```
{
  "a": {
    "b": "c"
  }
}
```

Source filtering should be consistent with this behaviour so that an include
list containing `a` should include fields whose name is `a.b`.

To make this change easier, source filtering was refactored to use automata.
The ability to treat dots in field names as sub objects is provided by the
`makeMatchDotsInFieldNames` method of `XContentMapValues`.

Closes #20719
This commit is contained in:
Adrien Grand 2016-10-10 09:32:26 +02:00 committed by GitHub
parent 741ecf80ff
commit 8ab7ca5284
3 changed files with 221 additions and 89 deletions

View File

@ -19,8 +19,13 @@
package org.elasticsearch.common.regex;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.Strings;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
@ -46,6 +51,33 @@ public class Regex {
return str.equals("*");
}
/** Return an {@link Automaton} that matches the given pattern. */
public static Automaton simpleMatchToAutomaton(String pattern) {
List<Automaton> automata = new ArrayList<>();
int previous = 0;
for (int i = pattern.indexOf('*'); i != -1; i = pattern.indexOf('*', i + 1)) {
automata.add(Automata.makeString(pattern.substring(previous, i)));
automata.add(Automata.makeAnyString());
previous = i + 1;
}
automata.add(Automata.makeString(pattern.substring(previous)));
return Operations.concatenate(automata);
}
/**
* Return an Automaton that matches the union of the provided patterns.
*/
public static Automaton simpleMatchToAutomaton(String... patterns) {
if (patterns.length < 1) {
throw new IllegalArgumentException("There must be at least one pattern, zero given");
}
List<Automaton> automata = new ArrayList<>();
for (String pattern : patterns) {
automata.add(simpleMatchToAutomaton(pattern));
}
return Operations.union(automata);
}
/**
* Match a String against the given pattern, supporting the following simple
* pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an

View File

@ -19,12 +19,17 @@
package org.elasticsearch.common.xcontent.support;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.unit.TimeValue;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -134,115 +139,171 @@ public class XContentMapValues {
return null;
}
public static Map<String, Object> filter(Map<String, Object> map, String[] includes, String[] excludes) {
Map<String, Object> result = new HashMap<>();
filter(map, result, includes == null ? Strings.EMPTY_ARRAY : includes, excludes == null ? Strings.EMPTY_ARRAY : excludes, new StringBuilder());
return result;
/**
* Only keep properties in {@code map} that match the {@code includes} but
* not the {@code excludes}. An empty list of includes is interpreted as a
* wildcard while an empty list of excludes does not match anything.
*
* If a property matches both an include and an exclude, then the exclude
* wins.
*
* If an object matches, then any of its sub properties are automatically
* considered as matching as well, both for includes and excludes.
*
* Dots in field names are treated as sub objects. So for instance if a
* document contains {@code a.b} as a property and {@code a} is an include,
* then {@code a.b} will be kept in the filtered map.
*/
public static Map<String, Object> filter(Map<String, ?> map, String[] includes, String[] excludes) {
CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString());
CharacterRunAutomaton include;
if (includes == null || includes.length == 0) {
include = matchAllAutomaton;
} else {
Automaton includeA = Regex.simpleMatchToAutomaton(includes);
includeA = makeMatchDotsInFieldNames(includeA);
include = new CharacterRunAutomaton(includeA);
}
Automaton excludeA;
if (excludes == null || excludes.length == 0) {
excludeA = Automata.makeEmpty();
} else {
excludeA = Regex.simpleMatchToAutomaton(excludes);
excludeA = makeMatchDotsInFieldNames(excludeA);
}
CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA);
// NOTE: We cannot use Operations.minus because of the special case that
// we want all sub properties to match as soon as an object matches
return filter(map,
include, include.getInitialState(),
exclude, exclude.getInitialState(),
matchAllAutomaton);
}
private static void filter(Map<String, Object> map, Map<String, Object> into, String[] includes, String[] excludes, StringBuilder sb) {
if (includes.length == 0 && excludes.length == 0) {
into.putAll(map);
return;
/** Make matches on objects also match dots in field names.
* For instance, if the original simple regex is `foo`, this will translate
* it into `foo` OR `foo.*`. */
private static Automaton makeMatchDotsInFieldNames(Automaton automaton) {
return Operations.union(
automaton,
Operations.concatenate(Arrays.asList(automaton, Automata.makeChar('.'), Automata.makeAnyString())));
}
private static int step(CharacterRunAutomaton automaton, String key, int state) {
for (int i = 0; state != -1 && i < key.length(); ++i) {
state = automaton.step(state, key.charAt(i));
}
for (Map.Entry<String, Object> entry : map.entrySet()) {
return state;
}
private static Map<String, Object> filter(Map<String, ?> map,
CharacterRunAutomaton includeAutomaton, int initialIncludeState,
CharacterRunAutomaton excludeAutomaton, int initialExcludeState,
CharacterRunAutomaton matchAllAutomaton) {
Map<String, Object> filtered = new HashMap<>();
for (Map.Entry<String, ?> entry : map.entrySet()) {
String key = entry.getKey();
int mark = sb.length();
if (sb.length() > 0) {
sb.append('.');
}
sb.append(key);
String path = sb.toString();
if (Regex.simpleMatch(excludes, path)) {
sb.setLength(mark);
int includeState = step(includeAutomaton, key, initialIncludeState);
if (includeState == -1) {
continue;
}
boolean exactIncludeMatch = false; // true if the current position was specifically mentioned
boolean pathIsPrefixOfAnInclude = false; // true if potentially a sub scope can be included
if (includes.length == 0) {
// implied match anything
exactIncludeMatch = true;
int excludeState = step(excludeAutomaton, key, initialExcludeState);
if (excludeState != -1 && excludeAutomaton.isAccept(excludeState)) {
continue;
}
Object value = entry.getValue();
CharacterRunAutomaton subIncludeAutomaton = includeAutomaton;
int subIncludeState = includeState;
if (includeAutomaton.isAccept(includeState)) {
if (excludeState == -1 || excludeAutomaton.step(excludeState, '.') == -1) {
// the exclude has no chances to match inner properties
filtered.put(key, value);
continue;
} else {
// the object matched, so consider that the include matches every inner property
// we only care about excludes now
subIncludeAutomaton = matchAllAutomaton;
subIncludeState = includeAutomaton.getInitialState();
}
}
if (value instanceof Map) {
subIncludeState = subIncludeAutomaton.step(subIncludeState, '.');
if (subIncludeState == -1) {
continue;
}
if (excludeState != -1) {
excludeState = excludeAutomaton.step(excludeState, '.');
}
Map<String, Object> valueAsMap = (Map<String, Object>) value;
Map<String, Object> filteredValue = filter(valueAsMap,
subIncludeAutomaton, subIncludeState, excludeAutomaton, excludeState, matchAllAutomaton);
if (includeAutomaton.isAccept(includeState) || filteredValue.isEmpty() == false) {
filtered.put(key, filteredValue);
}
} else if (value instanceof Iterable) {
List<Object> filteredValue = filter((Iterable<?>) value,
subIncludeAutomaton, subIncludeState, excludeAutomaton, excludeState, matchAllAutomaton);
if (includeAutomaton.isAccept(includeState) || filteredValue.isEmpty() == false) {
filtered.put(key, filteredValue);
}
} else {
for (String include : includes) {
// check for prefix matches as well to see if we need to zero in, something like: obj1.arr1.* or *.field
// note, this does not work well with middle matches, like obj1.*.obj3
if (include.charAt(0) == '*') {
if (Regex.simpleMatch(include, path)) {
exactIncludeMatch = true;
break;
}
pathIsPrefixOfAnInclude = true;
continue;
}
if (include.startsWith(path)) {
if (include.length() == path.length()) {
exactIncludeMatch = true;
break;
} else if (include.length() > path.length() && include.charAt(path.length()) == '.') {
// include might may match deeper paths. Dive deeper.
pathIsPrefixOfAnInclude = true;
continue;
}
}
if (Regex.simpleMatch(include, path)) {
exactIncludeMatch = true;
break;
}
// leaf property
if (includeAutomaton.isAccept(includeState)
&& (excludeState == -1 || excludeAutomaton.isAccept(excludeState) == false)) {
filtered.put(key, value);
}
}
if (!(pathIsPrefixOfAnInclude || exactIncludeMatch)) {
// skip subkeys, not interesting.
sb.setLength(mark);
continue;
}
if (entry.getValue() instanceof Map) {
Map<String, Object> innerInto = new HashMap<>();
// if we had an exact match, we want give deeper excludes their chance
filter((Map<String, Object>) entry.getValue(), innerInto, exactIncludeMatch ? Strings.EMPTY_ARRAY : includes, excludes, sb);
if (exactIncludeMatch || !innerInto.isEmpty()) {
into.put(entry.getKey(), innerInto);
}
} else if (entry.getValue() instanceof List) {
List<Object> list = (List<Object>) entry.getValue();
List<Object> innerInto = new ArrayList<>(list.size());
// if we had an exact match, we want give deeper excludes their chance
filter(list, innerInto, exactIncludeMatch ? Strings.EMPTY_ARRAY : includes, excludes, sb);
into.put(entry.getKey(), innerInto);
} else if (exactIncludeMatch) {
into.put(entry.getKey(), entry.getValue());
}
sb.setLength(mark);
}
return filtered;
}
private static void filter(List<Object> from, List<Object> to, String[] includes, String[] excludes, StringBuilder sb) {
if (includes.length == 0 && excludes.length == 0) {
to.addAll(from);
return;
}
for (Object o : from) {
if (o instanceof Map) {
Map<String, Object> innerInto = new HashMap<>();
filter((Map<String, Object>) o, innerInto, includes, excludes, sb);
if (!innerInto.isEmpty()) {
to.add(innerInto);
private static List<Object> filter(Iterable<?> iterable,
CharacterRunAutomaton includeAutomaton, int initialIncludeState,
CharacterRunAutomaton excludeAutomaton, int initialExcludeState,
CharacterRunAutomaton matchAllAutomaton) {
List<Object> filtered = new ArrayList<>();
for (Object value : iterable) {
if (value instanceof Map) {
int includeState = includeAutomaton.step(initialIncludeState, '.');
int excludeState = initialExcludeState;
if (excludeState != -1) {
excludeState = excludeAutomaton.step(excludeState, '.');
}
} else if (o instanceof List) {
List<Object> innerInto = new ArrayList<>();
filter((List<Object>) o, innerInto, includes, excludes, sb);
if (!innerInto.isEmpty()) {
to.add(innerInto);
Map<String, Object> filteredValue = filter((Map<String, ?>)value,
includeAutomaton, includeState, excludeAutomaton, excludeState, matchAllAutomaton);
if (filteredValue.isEmpty() == false) {
filtered.add(filteredValue);
}
} else if (value instanceof Iterable) {
List<Object> filteredValue = filter((Iterable<?>) value,
includeAutomaton, initialIncludeState, excludeAutomaton, initialExcludeState, matchAllAutomaton);
if (filteredValue.isEmpty() == false) {
filtered.add(filteredValue);
}
} else {
to.add(o);
// TODO: we have tests relying on this behavior on arrays even
// if the path does not match, but this looks like a bug?
filtered.add(value);
}
}
return filtered;
}
public static boolean isObject(Object node) {

View File

@ -551,4 +551,43 @@ public class XContentMapValuesTests extends ESTestCase {
parser.list());
}
}
public void testDotsInFieldNames() {
Map<String, Object> map = new HashMap<>();
map.put("foo.bar", 2);
Map<String, Object> sub = new HashMap<>();
sub.put("baz", 3);
map.put("foo", sub);
map.put("quux", 5);
// dots in field names in includes
Map<String, Object> filtered = XContentMapValues.filter(map, new String[] {"foo"}, new String[0]);
Map<String, Object> expected = new HashMap<>(map);
expected.remove("quux");
assertEquals(expected, filtered);
// dots in field names in excludes
filtered = XContentMapValues.filter(map, new String[0], new String[] {"foo"});
expected = new HashMap<>(map);
expected.keySet().retainAll(Collections.singleton("quux"));
assertEquals(expected, filtered);
}
public void testSupplementaryCharactersInPaths() {
Map<String, Object> map = new HashMap<>();
map.put("搜索", 2);
map.put("指数", 3);
assertEquals(Collections.singletonMap("搜索", 2), XContentMapValues.filter(map, new String[] {"搜索"}, new String[0]));
assertEquals(Collections.singletonMap("指数", 3), XContentMapValues.filter(map, new String[0], new String[] {"搜索"}));
}
public void testSharedPrefixes() {
Map<String, Object> map = new HashMap<>();
map.put("foobar", 2);
map.put("foobaz", 3);
assertEquals(Collections.singletonMap("foobar", 2), XContentMapValues.filter(map, new String[] {"foobar"}, new String[0]));
assertEquals(Collections.singletonMap("foobaz", 3), XContentMapValues.filter(map, new String[0], new String[] {"foobar"}));
}
}