Add regex query support to wildcard field (approach 2) (#55548) (#57141)

Backport of #55548

Adds equivalence with the keyword field's query support to the wildcard field: regexp, fuzzy, wildcard and prefix queries are all supported.
All of these queries use an approximation query against the ngram index, backed by an automaton-based verification query.

Closes #54275
markharwood 2020-05-26 16:55:59 +01:00 committed by GitHub
parent 9f1e3bc82b
commit b2bc6071fd
5 changed files with 1005 additions and 208 deletions
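The shape of the accelerated queries is the same in every case: a cheap ngram approximation query ANDed with an exact, automaton-backed verification query that runs against binary doc values. A minimal sketch of that composition, assuming a hypothetical field name and ngram terms (AutomatonQueryOnBinaryDv and the deferred-Supplier pattern are the ones introduced by this commit):

import java.util.function.Supplier;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.automaton.Automaton;

// Approximation: MUST clauses over ngram terms; may produce false positives.
BooleanQuery.Builder approx = new BooleanQuery.Builder();
approx.add(new TermQuery(new Term("my_wildcard", "foo")), Occur.MUST);
approx.add(new TermQuery(new Term("my_wildcard", "oba")), Occur.MUST);

// Verification: the automaton is built lazily, so the cost is only paid at execution time.
Supplier<Automaton> automaton = () -> WildcardQuery.toAutomaton(new Term("my_wildcard", "*foobar*"));
Query verify = new AutomatonQueryOnBinaryDv("my_wildcard", "*foobar*", automaton);

BooleanQuery.Builder combined = new BooleanQuery.Builder();
combined.add(approx.build(), Occur.MUST);   // fast ngram filter
combined.add(verify, Occur.MUST);           // removes any false positives
Query query = combined.build();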

View File: wildcard field REST API test (YAML)

@@ -1,8 +1,8 @@
 setup:
   - skip:
       features: headers
-      version: " - 7.7.99"
-      reason: "wildcard fields were added from 7.8"
+      version: " - 7.8.99"
+      reason: "wildcard fields were added from 7.9"
   - do:
       indices.create:
View File: AutomatonQueryOnBinaryDv.java

@@ -24,6 +24,7 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;
 import java.io.IOException;
 import java.util.Objects;
+import java.util.function.Supplier;

 /**
  * Query that runs an Automaton across all binary doc values.
@@ -33,18 +34,19 @@ public class AutomatonQueryOnBinaryDv extends Query {
     private final String field;
     private final String matchPattern;
-    private final Automaton automaton;
+    private final Supplier<Automaton> automatonSupplier;

-    public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) {
+    public AutomatonQueryOnBinaryDv(String field, String matchPattern, Supplier<Automaton> automatonSupplier) {
         this.field = field;
         this.matchPattern = matchPattern;
-        this.automaton = automaton;
+        this.automatonSupplier = automatonSupplier;
     }

     @Override
     public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
-        ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
+        ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automatonSupplier.get());
         return new ConstantScoreWeight(this, boost) {
@@ -92,6 +94,9 @@ public class AutomatonQueryOnBinaryDv extends Query {
     @Override
     public boolean equals(Object obj) {
+        if (obj == null || obj.getClass() != getClass()) {
+            return false;
+        }
         AutomatonQueryOnBinaryDv other = (AutomatonQueryOnBinaryDv) obj;
         return Objects.equals(field, other.field) && Objects.equals(matchPattern, other.matchPattern);
     }

View File: MatchAllButRequireVerificationQuery.java (new file)

@ -0,0 +1,50 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.wildcard.mapper;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import java.io.IOException;
/**
* A query that matches all documents. The class is more of a marker
* that we encountered something that will need verification.
* (A MatchAllDocs query is used to indicate we can match all
* _without_ verification)
*/
public final class MatchAllButRequireVerificationQuery extends Query {
@Override
public Query rewrite(IndexReader reader) throws IOException {
return new MatchAllDocsQuery();
}
@Override
public String toString(String field) {
return "*:* (tbc)";
}
@Override
public boolean equals(Object o) {
return sameClassAs(o);
}
@Override
public int hashCode() {
return classHash();
}
@Override
public void visit(QueryVisitor visitor) {
visitor.visitLeaf(this);
}
}
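A note on how this marker is consumed - a condensed sketch of the branching in regexpQuery() from WildcardFieldMapper later in this diff (names as in this commit):

// approxNgramQuery is the ngram rewrite of the regex approximation
if (approxNgramQuery instanceof MatchAllDocsQuery) {
    return existsQuery(context);   // e.g. ".*" - matches everything, no verification needed
}
if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) {
    return verifyingQuery;         // e.g. ".." - nothing to accelerate, run verification only
}
// otherwise AND the ngram approximation with the verification query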

View File: WildcardFieldMapper.java

@@ -21,24 +21,32 @@ import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.DocValuesFieldExistsQuery;
+import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.MultiTermQuery;
 import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
+import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.RegExp;
+import org.apache.lucene.util.automaton.RegExp.Kind;
+import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.support.XContentMapValues;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalyzerScope;
+import org.elasticsearch.index.analysis.LowercaseNormalizer;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.fielddata.IndexFieldData;
 import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
@@ -63,11 +71,16 @@ import org.elasticsearch.search.aggregations.support.ValuesSourceType;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.Iterator;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.function.Supplier;

 import static org.elasticsearch.index.mapper.TypeParsers.parseField;
+import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;

 /**
  * A {@link FieldMapper} for indexing fields with ngrams for efficient wildcard matching
@@ -206,9 +219,13 @@ public class WildcardFieldMapper extends FieldMapper {
     }

     public static final char TOKEN_START_OR_END_CHAR = 0;
+    public static final String TOKEN_START_STRING = Character.toString(TOKEN_START_OR_END_CHAR);
+    public static final String TOKEN_END_STRING = TOKEN_START_STRING + TOKEN_START_STRING;

     public static final class WildcardFieldType extends MappedFieldType {

+        static Analyzer lowercaseNormalizer = new LowercaseNormalizer();
+
         public WildcardFieldType() {
             setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
             setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
@@ -223,218 +240,533 @@ public class WildcardFieldMapper extends FieldMapper {
             return result;
         }
-        // Holds parsed information about the wildcard pattern
-        static class PatternStructure {
-            boolean openStart, openEnd, hasSymbols;
-            int lastGap = 0;
-            int wildcardCharCount, wildcardStringCount;
-            String[] fragments;
-            Integer[] precedingGapSizes;
-            final String pattern;
-
-            @SuppressWarnings("fallthrough") // Intentionally uses fallthrough mirroring implementation in Lucene's WildcardQuery
-            PatternStructure(String wildcardText) {
-                this.pattern = wildcardText;
-                ArrayList<String> fragmentList = new ArrayList<>();
-                ArrayList<Integer> precedingGapSizeList = new ArrayList<>();
-                StringBuilder sb = new StringBuilder();
-                for (int i = 0; i < wildcardText.length();) {
-                    final int c = wildcardText.codePointAt(i);
-                    int length = Character.charCount(c);
-                    switch (c) {
-                        case WildcardQuery.WILDCARD_STRING:
-                            if (i == 0) {
-                                openStart = true;
-                            }
-                            openEnd = true;
-                            hasSymbols = true;
-                            wildcardStringCount++;
-                            if (sb.length() > 0) {
-                                precedingGapSizeList.add(lastGap);
-                                fragmentList.add(sb.toString());
-                                sb = new StringBuilder();
-                            }
-                            lastGap = Integer.MAX_VALUE;
-                            break;
-                        case WildcardQuery.WILDCARD_CHAR:
-                            if (i == 0) {
-                                openStart = true;
-                            }
-                            hasSymbols = true;
-                            wildcardCharCount++;
-                            openEnd = true;
-                            if (sb.length() > 0) {
-                                precedingGapSizeList.add(lastGap);
-                                fragmentList.add(sb.toString());
-                                sb = new StringBuilder();
-                                lastGap = 0;
-                            }
-                            if (lastGap != Integer.MAX_VALUE) {
-                                lastGap++;
-                            }
-                            break;
-                        case WildcardQuery.WILDCARD_ESCAPE:
-                            // add the next codepoint instead, if it exists
-                            if (i + length < wildcardText.length()) {
-                                final int nextChar = wildcardText.codePointAt(i + length);
-                                length += Character.charCount(nextChar);
-                                sb.append(Character.toChars(nextChar));
-                                openEnd = false;
-                                break;
-                            } // else fallthru, lenient parsing with a trailing \
-                        default:
-                            openEnd = false;
-                            sb.append(Character.toChars(c));
-                    }
-                    i += length;
-                }
-                if (sb.length() > 0) {
-                    precedingGapSizeList.add(lastGap);
-                    fragmentList.add(sb.toString());
-                    lastGap = 0;
-                }
-                fragments = fragmentList.toArray(new String[0]);
-                precedingGapSizes = precedingGapSizeList.toArray(new Integer[0]);
-            }
-
-            public boolean needsVerification() {
-                // Return true if term queries are not enough evidence
-                if (fragments.length == 1 && wildcardCharCount == 0) {
-                    // The one case where we don't need verification is when
-                    // we have a single fragment and no ? characters
-                    return false;
-                }
-                return true;
-            }
-
-            // Returns number of positions for last gap (Integer.MAX means unlimited gap)
-            public int getPrecedingGapSize(int fragmentNum) {
-                return precedingGapSizes[fragmentNum];
-            }
-
-            public boolean isMatchAll() {
-                return fragments.length == 0 && wildcardStringCount > 0 && wildcardCharCount == 0;
-            }
-
-            @Override
-            public int hashCode() {
-                return pattern.hashCode();
-            }
-
-            @Override
-            public boolean equals(Object obj) {
-                PatternStructure other = (PatternStructure) obj;
-                return pattern.equals(other.pattern);
-            }
-        }
-
-        @Override
-        public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) {
-            PatternStructure patternStructure = new PatternStructure(wildcardPattern);
-            ArrayList<String> tokens = new ArrayList<>();
-
-            for (int i = 0; i < patternStructure.fragments.length; i++) {
-                String fragment = patternStructure.fragments[i];
-                int fLength = fragment.length();
-                if (fLength == 0) {
-                    continue;
-                }
-
-                // Add any start/end of string character
-                if (i == 0 && patternStructure.openStart == false) {
-                    // Start-of-string anchored (is not a leading wildcard)
-                    fragment = TOKEN_START_OR_END_CHAR + fragment;
-                }
-                if (patternStructure.openEnd == false && i == patternStructure.fragments.length - 1) {
-                    // End-of-string anchored (is not a trailing wildcard)
-                    fragment = fragment + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
-                }
-                if (fragment.codePointCount(0, fragment.length()) <= NGRAM_SIZE) {
-                    tokens.add(fragment);
-                } else {
-                    // Break fragment into multiple Ngrams
-                    TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment);
-                    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
-                    String lastUnusedToken = null;
-                    try {
-                        tokenizer.reset();
-                        boolean takeThis = true;
-                        // minimise number of terms searched - eg for "12345" and 3grams we only need terms
-                        // `123` and `345` - no need to search for 234. We take every other ngram.
-                        while (tokenizer.incrementToken()) {
-                            String tokenValue = termAtt.toString();
-                            if (takeThis) {
-                                tokens.add(tokenValue);
-                            } else {
-                                lastUnusedToken = tokenValue;
-                            }
-                            // alternate
-                            takeThis = !takeThis;
-                        }
-                        if (lastUnusedToken != null) {
-                            // given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing
-                            // `ake` to complete the logic.
-                            tokens.add(lastUnusedToken);
-                        }
-                        tokenizer.end();
-                        tokenizer.close();
-                    } catch (IOException ioe) {
-                        throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment [" + fragment + "]");
-                    }
-                }
-            }
-
-            if (patternStructure.isMatchAll()) {
-                return new MatchAllDocsQuery();
-            }
-            BooleanQuery approximation = createApproximationQuery(tokens);
-            if (approximation.clauses().size() > 1 || patternStructure.needsVerification()) {
-                BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
-                verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST));
-                Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
-                verifyingBuilder.add(new BooleanClause(new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton), Occur.MUST));
-                return verifyingBuilder.build();
-            }
-            return approximation;
-        }
-
-        private BooleanQuery createApproximationQuery(ArrayList<String> tokens) {
-            BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
-            if (tokens.size() <= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
-                for (String token : tokens) {
-                    addClause(token, bqBuilder);
-                }
-                return bqBuilder.build();
-            }
-            // Thin out the number of clauses using a selection spread evenly across the range
-            float step = (float) (tokens.size() - 1) / (float) (MAX_CLAUSES_IN_APPROXIMATION_QUERY - 1); // set step size
-            for (int i = 0; i < MAX_CLAUSES_IN_APPROXIMATION_QUERY; i++) {
-                addClause(tokens.get(Math.round(step * i)), bqBuilder); // add each element of a position which is a multiple of step
-            }
-            // TODO we can be smarter about pruning here. e.g.
-            // * Avoid wildcard queries if there are sufficient numbers of other terms that are full 3grams that are cheaper term queries
-            // * We can select terms on their scarcity rather than even spreads across the search string.
-            return bqBuilder.build();
-        }
-
-        private void addClause(String token, BooleanQuery.Builder bqBuilder) {
-            assert token.codePointCount(0, token.length()) <= NGRAM_SIZE;
-            if (token.codePointCount(0, token.length()) == NGRAM_SIZE) {
-                TermQuery tq = new TermQuery(new Term(name(), token));
-                bqBuilder.add(new BooleanClause(tq, Occur.MUST));
-            } else {
-                WildcardQuery wq = new WildcardQuery(new Term(name(), token + "*"));
-                wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
-                bqBuilder.add(new BooleanClause(wq, Occur.MUST));
-            }
-        }
        @Override
        public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) {

            String ngramIndexPattern = addLineEndChars(toLowerCase(wildcardPattern));

            // Break search term into tokens
            Set<String> tokens = new LinkedHashSet<>();
            StringBuilder sequence = new StringBuilder();
            int numWildcardChars = 0;
            int numWildcardStrings = 0;
            for (int i = 0; i < ngramIndexPattern.length();) {
                final int c = ngramIndexPattern.codePointAt(i);
                int length = Character.charCount(c);
                switch (c) {
                    case WildcardQuery.WILDCARD_STRING:
                        if (sequence.length() > 0) {
                            getNgramTokens(tokens, sequence.toString());
                            sequence = new StringBuilder();
                        }
                        numWildcardStrings++;
                        break;
                    case WildcardQuery.WILDCARD_CHAR:
                        if (sequence.length() > 0) {
                            getNgramTokens(tokens, sequence.toString());
                            sequence = new StringBuilder();
                        }
                        numWildcardChars++;
                        break;
                    case WildcardQuery.WILDCARD_ESCAPE:
                        // add the next codepoint instead, if it exists
                        if (i + length < ngramIndexPattern.length()) {
                            final int nextChar = ngramIndexPattern.codePointAt(i + length);
                            length += Character.charCount(nextChar);
                            sequence.append(Character.toChars(nextChar));
                        } else {
                            sequence.append(Character.toChars(c));
                        }
                        break;
                    default:
                        sequence.append(Character.toChars(c));
                }
                i += length;
            }

            if (sequence.length() > 0) {
                getNgramTokens(tokens, sequence.toString());
            }

            BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
            int clauseCount = 0;
            for (String string : tokens) {
                if (clauseCount >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
                    break;
                }
                addClause(string, rewritten, Occur.MUST);
                clauseCount++;
            }
            Supplier<Automaton> deferredAutomatonSupplier = () -> {
                return WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
            };
            AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), wildcardPattern, deferredAutomatonSupplier);
            if (clauseCount > 0) {
                // We can accelerate execution with the ngram query
                BooleanQuery approxQuery = rewritten.build();
                BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
                verifyingBuilder.add(new BooleanClause(approxQuery, Occur.MUST));
                verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
                return verifyingBuilder.build();
            } else if (numWildcardChars == 0 || numWildcardStrings > 0) {
                // We have no concrete characters and we're not a pure length query e.g. ???
                return new DocValuesFieldExistsQuery(name());
            }
            return verifyingQuery;
        }
@Override
public Query regexpQuery(String value, int flags, int maxDeterminizedStates, RewriteMethod method, QueryShardContext context) {
if (value.length() == 0) {
return new MatchNoDocsQuery();
}
if (context.allowExpensiveQueries() == false) {
throw new ElasticsearchException(
"[regexp] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false."
);
}
RegExp ngramRegex = new RegExp(addLineEndChars(toLowerCase(value)), flags);
Query approxBooleanQuery = toApproximationQuery(ngramRegex);
Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery);
// MatchAll is a special case meaning the regex is known to match everything .* and
// there is no need for verification.
if (approxNgramQuery instanceof MatchAllDocsQuery) {
return existsQuery(context);
}
Supplier<Automaton> deferredAutomatonSupplier = ()-> {
RegExp regex = new RegExp(value, flags);
return regex.toAutomaton(maxDeterminizedStates);
};
AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), value, deferredAutomatonSupplier);
// MatchAllButRequireVerificationQuery is a special case meaning the regex is reduced to a single
// clause which we can't accelerate at all and needs verification. Example would be ".."
if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) {
return verifyingQuery;
}
// We can accelerate execution with the ngram query
BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
verifyingBuilder.add(new BooleanClause(approxNgramQuery, Occur.MUST));
verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
return verifyingBuilder.build();
}
// Convert a regular expression to a simplified query consisting of BooleanQuery and TermQuery objects
// which captures as much of the logic as possible. Query can produce some false positives but shouldn't
// produce any false negatives.
// In addition to Term and BooleanQuery clauses there are MatchAllDocsQuery objects (e.g for .*) and
// a RegExpQuery if we can't resolve to any of the above.
// * If an expression resolves to a single MatchAllDocsQuery eg .* then a match all shortcut is possible with
// no verification needed.
// * If an expression resolves to a RegExpQuery eg ?? then only the verification
// query is run.
// * Anything else is a concrete query that should be run on the ngram index.
public static Query toApproximationQuery(RegExp r) throws IllegalArgumentException {
Query result = null;
switch (r.kind) {
case REGEXP_UNION:
result = createUnionQuery(r);
break;
case REGEXP_CONCATENATION:
result = createConcatenationQuery(r);
break;
case REGEXP_STRING:
String normalizedString = toLowerCase(r.s);
result = new TermQuery(new Term("", normalizedString));
break;
case REGEXP_CHAR:
String cs = new StringBuilder().appendCodePoint(r.c).toString();
String normalizedChar = toLowerCase(cs);
result = new TermQuery(new Term("", normalizedChar));
break;
case REGEXP_REPEAT:
// Repeat is zero or more times so zero matches = match all
result = new MatchAllDocsQuery();
break;
case REGEXP_REPEAT_MIN:
case REGEXP_REPEAT_MINMAX:
if (r.min > 0) {
result = toApproximationQuery(r.exp1);
if(result instanceof TermQuery) {
// Wrap the repeating expression so that it is not concatenated by a parent which concatenates
// plain TermQuery objects together. Boolean queries are interpreted as a black box and not
// concatenated.
BooleanQuery.Builder wrapper = new BooleanQuery.Builder();
wrapper.add(result, Occur.MUST);
result = wrapper.build();
}
} else {
// Expressions like (a){0,3} match empty string or up to 3 a's.
result = new MatchAllButRequireVerificationQuery();
}
break;
case REGEXP_ANYSTRING:
// optimisation for .* queries - match all and no verification stage required.
result = new MatchAllDocsQuery();
break;
// All other kinds of expression cannot be represented as a boolean or term query so return an object
// that indicates verification is required
case REGEXP_OPTIONAL:
case REGEXP_INTERSECTION:
case REGEXP_COMPLEMENT:
case REGEXP_CHAR_RANGE:
case REGEXP_ANYCHAR:
case REGEXP_INTERVAL:
case REGEXP_EMPTY:
case REGEXP_AUTOMATON:
result = new MatchAllButRequireVerificationQuery();
break;
}
assert result != null; // All regex types are understood and translated to a query.
return result;
}
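// Worked example (taken from this commit's tests): the regex "(http|ftp)://foo.*"
// is approximated by the translation above as
//     +((+"htt" +"ttp") "ftp") +(+"://" +"/fo" +"foo")
// once rewritten to ngrams - concrete MUST/SHOULD clauses the ngram index can
// execute, with any remaining false positives removed by the verification query.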
private static Query createConcatenationQuery(RegExp r) {
// Create ANDs of expressions plus collapse consecutive TermQuerys into single longer ones
ArrayList<Query> queries = new ArrayList<>();
findLeaves(r.exp1, Kind.REGEXP_CONCATENATION, queries);
findLeaves(r.exp2, Kind.REGEXP_CONCATENATION, queries);
BooleanQuery.Builder bAnd = new BooleanQuery.Builder();
StringBuilder sequence = new StringBuilder();
for (Query query : queries) {
if (query instanceof TermQuery) {
TermQuery tq = (TermQuery) query;
sequence.append(tq.getTerm().text());
                } else {
                    if (sequence.length() > 0) {
                        bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
                        sequence = new StringBuilder();
                    }
                    bAnd.add(query, Occur.MUST);
                }
            }
            if (sequence.length() > 0) {
                bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
            }
            BooleanQuery combined = bAnd.build();
            if (combined.clauses().size() > 0) {
                return combined;
            }
            // There's something in the regex we couldn't represent as a query - resort to a match all with verification
            return new MatchAllButRequireVerificationQuery();
        }

        private static Query createUnionQuery(RegExp r) {
            // Create an OR of clauses
            ArrayList<Query> queries = new ArrayList<>();
            findLeaves(r.exp1, Kind.REGEXP_UNION, queries);
            findLeaves(r.exp2, Kind.REGEXP_UNION, queries);
            BooleanQuery.Builder bOr = new BooleanQuery.Builder();
            HashSet<Query> uniqueClauses = new HashSet<>();
            for (Query query : queries) {
                if (uniqueClauses.add(query)) {
                    bOr.add(query, Occur.SHOULD);
                }
            }
            if (uniqueClauses.size() > 0) {
                if (uniqueClauses.size() == 1) {
                    // Fully-understood ORs that collapse to a single term should be returned minus
                    // the BooleanQuery wrapper so that they might be concatenated.
                    // Helps turn [Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll] into "powershell"
                    // Each char pair eg (P OR p) can be normalized to (p) which can be a single term
                    return uniqueClauses.iterator().next();
                } else {
                    return bOr.build();
                }
            }
            // There's something in the regex we couldn't represent as a query - resort to a match all with verification
            return new MatchAllButRequireVerificationQuery();
        }

        private static void findLeaves(RegExp exp, Kind kind, List<Query> queries) {
            if (exp.kind == kind) {
                findLeaves(exp.exp1, kind, queries);
                findLeaves(exp.exp2, kind, queries);
            } else {
                queries.add(toApproximationQuery(exp));
            }
        }

        private static String toLowerCase(String string) {
            return lowercaseNormalizer.normalize(null, string).utf8ToString();
        }

        // Takes a BooleanQuery + TermQuery tree representing query logic and rewrites using ngrams of appropriate size.
        private Query rewriteBoolToNgramQuery(Query approxQuery) {
            // TODO optimise more intelligently so we:
            // 1) favour full-length term queries eg abc over short eg a* when pruning too many clauses.
            // 2) make MAX_CLAUSES_IN_APPROXIMATION_QUERY a global cap rather than per-boolean clause.
            if (approxQuery == null) {
                return null;
            }
            if (approxQuery instanceof BooleanQuery) {
                BooleanQuery bq = (BooleanQuery) approxQuery;
                BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
                int clauseCount = 0;
                for (BooleanClause clause : bq) {
                    Query q = rewriteBoolToNgramQuery(clause.getQuery());
                    if (q != null) {
                        if (clause.getOccur().equals(Occur.MUST)) {
                            // Can't drop "should" clauses because it can elevate a sibling optional item
                            // to mandatory (shoulds with 1 clause) causing false negatives
                            // Dropping MUSTs increase false positives which are OK because are verified anyway.
                            clauseCount++;
                            if (clauseCount >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
                                break;
                            }
                        }
                        rewritten.add(q, clause.getOccur());
                    }
                }
                return simplify(rewritten.build());
            }
            if (approxQuery instanceof TermQuery) {
                TermQuery tq = (TermQuery) approxQuery;

                // Remove simple terms that are only string beginnings or ends.
                String s = tq.getTerm().text();
                if (s.equals(WildcardFieldMapper.TOKEN_START_STRING) || s.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
                    return new MatchAllButRequireVerificationQuery();
                }

                // Break term into tokens
                Set<String> tokens = new LinkedHashSet<>();
                getNgramTokens(tokens, s);
                BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
                for (String string : tokens) {
                    addClause(string, rewritten, Occur.MUST);
                }
                return simplify(rewritten.build());
            }
            if (isMatchAll(approxQuery)) {
                return approxQuery;
            }
            throw new IllegalStateException("Invalid query type found parsing regex query:" + approxQuery);
        }

        static Query simplify(Query input) {
            if (input instanceof BooleanQuery == false) {
                return input;
            }
            BooleanQuery result = (BooleanQuery) input;
            if (result.clauses().size() == 0) {
                // A ".*" clause can produce zero clauses in which case we return MatchAll
                return new MatchAllDocsQuery();
            }
            if (result.clauses().size() == 1) {
                return simplify(result.clauses().get(0).getQuery());
            }

            // We may have a mix of MatchAll and concrete queries - assess if we can simplify
            int matchAllCount = 0;
            int verifyCount = 0;
            boolean allConcretesAreOptional = true;
            for (BooleanClause booleanClause : result.clauses()) {
                Query q = booleanClause.getQuery();
                if (q instanceof MatchAllDocsQuery) {
                    matchAllCount++;
                } else if (q instanceof MatchAllButRequireVerificationQuery) {
                    verifyCount++;
                } else {
                    // Concrete query
                    if (booleanClause.getOccur() != Occur.SHOULD) {
                        allConcretesAreOptional = false;
                    }
                }
            }

            if ((allConcretesAreOptional && matchAllCount > 0)) {
                // Any match all expression takes precedence over all optional concrete queries.
                return new MatchAllDocsQuery();
            }

            if ((allConcretesAreOptional && verifyCount > 0)) {
                // Any match all expression that needs verification takes precedence over all optional concrete queries.
                return new MatchAllButRequireVerificationQuery();
            }

            // We have some mandatory concrete queries - strip out the superfluous match all expressions
            if (allConcretesAreOptional == false && matchAllCount + verifyCount > 0) {
                BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
                for (BooleanClause booleanClause : result.clauses()) {
                    if (isMatchAll(booleanClause.getQuery()) == false) {
                        rewritten.add(booleanClause);
                    }
                }
                return simplify(rewritten.build());
            }
            return result;
        }
static boolean isMatchAll(Query q) {
return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
}
protected void getNgramTokens(Set<String> tokens, String fragment) {
if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
// If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search
// terms which can be ignored.
return;
}
// Break fragment into multiple Ngrams
TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment);
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
// If fragment length < NGRAM_SIZE then it is not emitted by token stream so need
// to initialise with the value here
String lastUnusedToken = fragment;
try {
tokenizer.reset();
boolean takeThis = true;
// minimise number of terms searched - eg for "12345" and 3grams we only need terms
// `123` and `345` - no need to search for 234. We take every other ngram.
while (tokenizer.incrementToken()) {
String tokenValue = termAtt.toString();
if (takeThis) {
tokens.add(tokenValue);
lastUnusedToken = null;
} else {
lastUnusedToken = tokenValue;
}
// alternate
takeThis = !takeThis;
if (tokens.size() >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
lastUnusedToken = null;
break;
}
}
if (lastUnusedToken != null) {
// given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing
// `ake` to complete the logic.
tokens.add(lastUnusedToken);
}
tokenizer.end();
tokenizer.close();
} catch (IOException ioe) {
throw new ElasticsearchParseException("Error parsing wildcard regex pattern fragment [" + fragment + "]");
}
}
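// Worked example of the alternate-gram trick above: for the fragment "12345" the
// analyzer emits the 3-grams 123, 234, 345; searching 123 and 345 alone still
// covers every character, so 234 is skipped. For "cake" only "cak" is taken in
// the loop, and lastUnusedToken then contributes the trailing "ake".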
private void addClause(String token, BooleanQuery.Builder bqBuilder, Occur occur) {
assert token.codePointCount(0, token.length()) <= NGRAM_SIZE;
int tokenSize = token.codePointCount(0, token.length());
if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
// there's something concrete to be searched but it's too short
// Require verification.
bqBuilder.add(new BooleanClause(new MatchAllButRequireVerificationQuery(), occur));
return;
}
if (tokenSize == NGRAM_SIZE) {
TermQuery tq = new TermQuery(new Term(name(), token));
bqBuilder.add(new BooleanClause(tq, occur));
} else {
PrefixQuery wq = new PrefixQuery(new Term(name(), token));
wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
bqBuilder.add(new BooleanClause(wq, occur));
}
}
@Override
public Query fuzzyQuery(
Object value,
Fuzziness fuzziness,
int prefixLength,
int maxExpansions,
boolean transpositions,
QueryShardContext context
) {
String searchTerm = BytesRefs.toString(value);
String lowerSearchTerm = toLowerCase(searchTerm);
try {
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
//The approximation query can have a prefix and any number of ngrams.
BooleanQuery.Builder approxBuilder = new BooleanQuery.Builder();
String postPrefixString = lowerSearchTerm;
// Add all content prior to prefixLength as a MUST clause to the ngram index query
if (prefixLength > 0) {
Set<String> prefixTokens = new LinkedHashSet<>();
postPrefixString = lowerSearchTerm.substring(prefixLength);
String prefixCandidate = TOKEN_START_OR_END_CHAR + lowerSearchTerm.substring(0, prefixLength);
getNgramTokens(prefixTokens, prefixCandidate);
for (String prefixToken : prefixTokens) {
addClause(prefixToken, approxBuilder, Occur.MUST);
}
}
// Tokenize all content after the prefix
TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), postPrefixString);
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
ArrayList<String> postPrefixTokens = new ArrayList<>();
String firstToken = null;
tokenizer.reset();
int tokenNumber = 0;
while (tokenizer.incrementToken()) {
if (tokenNumber == 0) {
String token = termAtt.toString();
if (firstToken == null) {
firstToken = token;
}
postPrefixTokens.add(token);
}
// Take every 3rd ngram so they are all disjoint. Our calculation for min_should_match
// number relies on there being no overlaps
tokenNumber++;
if (tokenNumber == 3) {
tokenNumber = 0;
}
}
tokenizer.end();
tokenizer.close();
BooleanQuery.Builder ngramBuilder = new BooleanQuery.Builder();
int numClauses = 0;
for (String token : postPrefixTokens) {
addClause(token, ngramBuilder, Occur.SHOULD);
numClauses++;
}
// Approximation query
if (numClauses > fuzziness.asDistance(searchTerm)) {
// Useful accelerant - set min should match based on number of permitted edits.
ngramBuilder.setMinimumNumberShouldMatch(numClauses - fuzziness.asDistance(searchTerm));
approxBuilder.add(ngramBuilder.build(), Occur.MUST);
}
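// Worked example of the accelerant above (illustrative; mirrors testFuzzyAcceleration
// in the tests below): for "12345678901" with prefixLength=2 and one permitted edit,
// the prefix ngram "\u000012" is a MUST clause, the remainder is cut into the disjoint
// 3-grams 345, 678 and 901, and minimumNumberShouldMatch = 3 grams - 1 edit = 2, the
// assumption being that with no overlaps each permitted edit can disturb at most one gram.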
BooleanQuery ngramQ = approxBuilder.build();
if (ngramQ.clauses().size()>0) {
bqBuilder.add(ngramQ, Occur.MUST);
}
Supplier <Automaton> deferredAutomatonSupplier = ()->{
// Verification query
FuzzyQuery fq = new FuzzyQuery(
new Term(name(), searchTerm),
fuzziness.asDistance(searchTerm),
prefixLength,
maxExpansions,
transpositions
);
return fq.getAutomata().automaton;
};
bqBuilder.add(new AutomatonQueryOnBinaryDv(name(), searchTerm, deferredAutomatonSupplier), Occur.MUST);
return bqBuilder.build();
} catch (IOException ioe) {
throw new ElasticsearchParseException("Error parsing wildcard field fuzzy string [" + searchTerm + "]");
}
    }

    @Override
@@ -568,7 +900,10 @@ public class WildcardFieldMapper extends FieldMapper {
         if (value == null || value.length() > ignoreAbove) {
             return;
         }
-        String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
+        // Always lower case the ngram index and value - helps with
+        // a) speed (less ngram variations to explore on disk and in RAM-based automaton) and
+        // b) uses less disk space
+        String ngramValue = addLineEndChars(WildcardFieldType.toLowerCase(value));
         Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType);
         fields.add(ngramField);
@@ -581,6 +916,11 @@ public class WildcardFieldMapper extends FieldMapper {
         }
     }
// Values held in the ngram index are encoded with special characters to denote start and end of values.
static String addLineEndChars(String value) {
return TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
}
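// Worked example (illustrative): the value "FooBar" is lower-cased and indexed as
// "\u0000foobar\u0000\u0000" - a single 0 char (TOKEN_START_STRING) marks the start
// of the value and a double 0 char (TOKEN_END_STRING) marks the end, so anchored
// patterns can match the markers as ordinary ngram characters.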
@Override
protected String contentType() {
return CONTENT_TYPE;

View File: WildcardFieldMapperTests.java

@@ -6,6 +6,7 @@
 package org.elasticsearch.xpack.wildcard.mapper;

+import org.apache.lucene.analysis.core.KeywordAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.SortedSetDocValuesField;
@@ -15,20 +16,31 @@ import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexSettings;
@@ -55,12 +67,22 @@ import java.util.HashSet;
 import java.util.function.BiFunction;

 import static org.hamcrest.Matchers.equalTo;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;

 public class WildcardFieldMapperTests extends ESTestCase {

+    static QueryShardContext createMockQueryShardContext(boolean allowExpensiveQueries) {
+        QueryShardContext queryShardContext = mock(QueryShardContext.class);
+        when(queryShardContext.allowExpensiveQueries()).thenReturn(allowExpensiveQueries);
+        return queryShardContext;
+    }
+
     private static final String KEYWORD_FIELD_NAME = "keyword_field";
     private static final String WILDCARD_FIELD_NAME = "wildcard_field";
-    static final int MAX_FIELD_LENGTH = 100;
+    public static final QueryShardContext MOCK_QSC = createMockQueryShardContext(true);
+    static final int MAX_FIELD_LENGTH = 30;
     static WildcardFieldMapper wildcardFieldType;
     static KeywordFieldMapper keywordFieldType;
@@ -136,11 +158,18 @@ public class WildcardFieldMapperTests extends ESTestCase {
         IndexSearcher searcher = newSearcher(reader);
         iw.close();

+        // Test wildcard query
         String queryString = randomABString((BooleanQuery.getMaxClauseCount() * 2) + 1);
         Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(queryString, null, null);
         TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
         assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));

+        // Test regexp query
+        wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(queryString, RegExp.ALL, 20000, null, MOCK_QSC);
+        wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
+        assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));
+
         reader.close();
         dir.close();
     }
@@ -181,15 +210,59 @@ public class WildcardFieldMapperTests extends ESTestCase {
         int numSearches = 100;
         for (int i = 0; i < numSearches; i++) {
-            String randomWildcardPattern = getRandomWildcardPattern();
-            Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(randomWildcardPattern, null, null);
-            TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.INDEXORDER);
-
-            Query keywordFieldQuery = new WildcardQuery(new Term(KEYWORD_FIELD_NAME, randomWildcardPattern));
-            TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.INDEXORDER);
-
-            assertThat(kwTopDocs.totalHits.value, equalTo(wildcardFieldTopDocs.totalHits.value));
+            Query wildcardFieldQuery = null;
+            Query keywordFieldQuery = null;
+            String pattern = null;
+            switch (randomInt(3)) {
+                case 0:
+                    pattern = getRandomWildcardPattern();
+                    wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
+                    keywordFieldQuery = keywordFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
+                    break;
+                case 1:
+                    pattern = getRandomRegexPattern(values);
+                    wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(pattern, RegExp.ALL, 20000, null, MOCK_QSC);
+                    keywordFieldQuery = keywordFieldType.fieldType().regexpQuery(pattern, RegExp.ALL, 20000, null, MOCK_QSC);
+                    break;
+                case 2:
+                    pattern = randomABString(5);
+                    wildcardFieldQuery = wildcardFieldType.fieldType().prefixQuery(pattern, null, MOCK_QSC);
+                    keywordFieldQuery = keywordFieldType.fieldType().prefixQuery(pattern, null, MOCK_QSC);
+                    break;
+                case 3:
+                    int edits = randomInt(2);
+                    int prefixLength = randomInt(4);
+                    pattern = getRandomFuzzyPattern(values, edits, prefixLength);
+                    Fuzziness fuzziness = Fuzziness.AUTO;
+                    switch (edits) {
+                        case 0:
+                            fuzziness = Fuzziness.ZERO;
+                            break;
+                        case 1:
+                            fuzziness = Fuzziness.ONE;
+                            break;
+                        case 2:
+                            fuzziness = Fuzziness.TWO;
+                            break;
+                        default:
+                            break;
+                    }
+                    // Prefix length shouldn't be longer than selected search string
+                    // BUT keyword field has a bug with prefix length when equal - see https://github.com/elastic/elasticsearch/issues/55790
+                    // so we opt for one less
+                    prefixLength = Math.min(pattern.length() - 1, prefixLength);
+                    boolean transpositions = randomBoolean();
+
+                    wildcardFieldQuery = wildcardFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
+                        transpositions, MOCK_QSC);
+                    keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
+                        transpositions, MOCK_QSC);
+                    break;
+            }
+            TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE);
+            TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE);
+            assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value));

             HashSet<Integer> expectedDocs = new HashSet<>();
             for (ScoreDoc topDoc : kwTopDocs.scoreDocs) {
@@ -201,7 +274,6 @@ public class WildcardFieldMapperTests extends ESTestCase {
             assertThat(expectedDocs.size(), equalTo(0));
         }
-
         //Test keyword and wildcard sort operations are also equivalent
         QueryShardContext shardContextMock = createMockShardContext();
@@ -222,7 +294,333 @@ public class WildcardFieldMapperTests extends ESTestCase {
         dir.close();
     }
public void testRegexAcceleration() throws IOException, ParseException {
// All these expressions should rewrite to a match all with no verification step required at all
String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"};
for (String regex : superfastRegexes) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
assertTrue(wildcardFieldQuery instanceof DocValuesFieldExistsQuery);
}
String matchNoDocsRegexes[]= { ""};
for (String regex : matchNoDocsRegexes) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
assertTrue(wildcardFieldQuery instanceof MatchNoDocsQuery);
}
// All of these regexes should be accelerated as the equivalent of the given QueryString query
String acceleratedTests[][] = {
{".*foo.*", "foo"},
{"..foobar","+foo +oba +ar_ +r__"},
{"(maynotexist)?foobar","+foo +oba +ar_ +r__"},
{".*/etc/passw.*", "+\\/et +tc\\/ +\\/pa +ass +ssw"},
{".*etc/passwd", "+etc +c\\/p +pas +ssw +wd_ +d__"},
{"(http|ftp)://foo.*", "+((+htt +ttp) ftp) +(+\\:\\/\\/ +\\/fo +foo)"},
{"[Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll]\\.[Ee][Xx][Ee]", "+_po +owe +ers +she +ell +l\\.e +exe +e__"},
{"foo<1-100>bar", "+(+_fo +foo) +(+bar +r__ )"},
{"(aaa.+&.+bbb)cat", "+cat +t__"},
{".a", "a__"}
};
for (String[] test : acceleratedTests) {
String regex = test[0];
String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
testExpectedAccelerationQuery(regex, wildcardFieldQuery, expectedAccelerationQueryString);
}
// All these expressions should rewrite to just the verification query (there's no ngram acceleration)
// TODO we can possibly improve on some of these
String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb"};
for (String regex : matchAllButVerifyTests) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
assertTrue(regex +" was not a pure verify query " +formatQuery(wildcardFieldQuery),
wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv);
}
// Documentation - regexes that do try accelerate but we would like to improve in future versions.
String suboptimalTests[][] = {
// TODO short wildcards like a* OR b* aren't great so we just drop them.
// Ideally we would attach to successors to create (acd OR bcd)
{ "[ab]cd", "+cd_ +d__"}
};
for (String[] test : suboptimalTests) {
String regex = test[0];
String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
testExpectedAccelerationQuery(regex, wildcardFieldQuery, expectedAccelerationQueryString);
}
}
// Make error messages more readable
String formatQuery(Query q) {
return q.toString().replaceAll(WILDCARD_FIELD_NAME+":", "").replaceAll(WildcardFieldMapper.TOKEN_START_STRING, "_");
}
public void testWildcardAcceleration() throws IOException, ParseException {
// All these expressions should rewrite to MatchAll with no verification step required at all
String superfastPattern[] = { "*", "**", "*?" };
for (String pattern : superfastPattern) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
assertTrue(
pattern + " was not a pure match all query " + formatQuery(wildcardFieldQuery),
wildcardFieldQuery instanceof DocValuesFieldExistsQuery
);
}
// All of these patterns should be accelerated.
String tests[][] = {
{ "*foobar", "+foo +oba +ar_ +r__" },
{ "foobar*", "+_fo +oob +bar" },
{ "foo\\*bar*", "+_fo +oo\\* +\\*ba +bar" },
{ "foo\\?bar*", "+_fo +oo\\? +\\?ba +bar" },
{ "foo*bar", "+_fo +foo +bar +r__" },
{ "foo?bar", "+_fo +foo +bar +r__" },
{ "?foo*bar?", "+foo +bar" },
{ "*c", "+c__" } };
for (String[] test : tests) {
String pattern = test[0];
String expectedAccelerationQueryString = test[1].replaceAll("_", "" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
testExpectedAccelerationQuery(pattern, wildcardFieldQuery, expectedAccelerationQueryString);
assertTrue(wildcardFieldQuery instanceof BooleanQuery);
}
// TODO All these expressions have no acceleration at all and could be improved
String slowPatterns[] = { "??" };
for (String pattern : slowPatterns) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
assertTrue(
pattern + " was not as slow as we assumed " + formatQuery(wildcardFieldQuery),
wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv
);
}
}
static class FuzzyTest {
String pattern;
int prefixLength;
Fuzziness fuzziness;
String expectedPrefixQuery;
int expectedMinShouldMatch;
String ngrams;
FuzzyTest(
String pattern,
int prefixLength,
Fuzziness fuzziness,
String expectedPrefixQuery,
int expectedMinShouldMatch,
String ngrams
) {
super();
this.pattern = pattern;
this.prefixLength = prefixLength;
this.fuzziness = fuzziness;
this.expectedPrefixQuery = expectedPrefixQuery;
this.expectedMinShouldMatch = expectedMinShouldMatch;
this.ngrams = ngrams;
}
Query getFuzzyQuery() {
return wildcardFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50, true, MOCK_QSC);
}
Query getExpectedApproxQuery() throws ParseException {
BooleanQuery.Builder bq = new BooleanQuery.Builder();
if (expectedPrefixQuery != null) {
String[] tokens = expectedPrefixQuery.split(" ");
Query prefixQuery = null;
if (tokens.length == 1) {
prefixQuery = new TermQuery(
new Term(WILDCARD_FIELD_NAME, tokens[0].replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
);
} else {
BooleanQuery.Builder pqb = new BooleanQuery.Builder();
for (String token : tokens) {
Query ngramQuery = new TermQuery(
new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
);
pqb.add(ngramQuery, Occur.MUST);
}
prefixQuery = pqb.build();
}
if (ngrams == null) {
return prefixQuery;
}
bq.add(prefixQuery, Occur.MUST);
}
if (ngrams != null) {
BooleanQuery.Builder nq = new BooleanQuery.Builder();
String[] tokens = ngrams.split(" ");
for (String token : tokens) {
Query ngramQuery = new TermQuery(
new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
);
nq.add(ngramQuery, Occur.SHOULD);
}
nq.setMinimumNumberShouldMatch(expectedMinShouldMatch);
bq.add(nq.build(), Occur.MUST);
}
return bq.build();
}
}
public void testFuzzyAcceleration() throws IOException, ParseException {
FuzzyTest[] tests = {
new FuzzyTest("123456", 0, Fuzziness.ONE, null, 1, "123 456"),
new FuzzyTest("1234567890", 2, Fuzziness.ONE, "_12", 1, "345 678"),
new FuzzyTest("12345678901", 2, Fuzziness.ONE, "_12", 2, "345 678 901"),
new FuzzyTest("12345678", 4, Fuzziness.ONE, "_12 234", 0, null)
};
for (FuzzyTest test : tests) {
Query wildcardFieldQuery = test.getFuzzyQuery();
testExpectedAccelerationQuery(test.pattern, wildcardFieldQuery, test.getExpectedApproxQuery());
}
}
void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {
QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
Query expectedAccelerationQuery = qsp.parse(expectedAccelerationQueryString);
testExpectedAccelerationQuery(regex, combinedQuery, expectedAccelerationQuery);
}
void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException {
BooleanQuery cq = (BooleanQuery) combinedQuery;
assert cq.clauses().size() == 2;
Query approximationQuery = null;
boolean verifyQueryFound = false;
for (BooleanClause booleanClause : cq.clauses()) {
Query q = booleanClause.getQuery();
if (q instanceof AutomatonQueryOnBinaryDv) {
verifyQueryFound = true;
} else {
approximationQuery = q;
}
}
assert verifyQueryFound;
String message = "regex: "+ regex +"\nactual query: " + formatQuery(approximationQuery) +
"\nexpected query: " + formatQuery(expectedAccelerationQuery) + "\n";
assertEquals(message, expectedAccelerationQuery, approximationQuery);
}
private String getRandomFuzzyPattern(HashSet<String> values, int edits, int prefixLength) {
assert edits >=0 && edits <=2;
// Pick one of the indexed document values to focus our queries on.
String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)];
if (edits == 0) {
return randomValue;
}
if (randomValue.length() > prefixLength) {
randomValue = randomValue.substring(0,prefixLength) + "C" + randomValue.substring(prefixLength);
edits--;
}
if(edits > 0) {
randomValue = randomValue + "a";
}
return randomValue;
}
private String getRandomRegexPattern(HashSet<String> values) {
// Pick one of the indexed document values to focus our queries on.
String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)];
return convertToRandomRegex(randomValue);
}
// Produces a random regex string guaranteed to match the provided value
protected String convertToRandomRegex(String randomValue) {
StringBuilder result = new StringBuilder();
//Pick a part of the string to change
int substitutionPoint = randomIntBetween(0, randomValue.length()-1);
int substitutionLength = randomIntBetween(1, Math.min(10, randomValue.length() - substitutionPoint));
//Add any head to the result, unchanged
if(substitutionPoint >0) {
result.append(randomValue.substring(0,substitutionPoint));
}
// Modify the middle...
String replacementPart = randomValue.substring(substitutionPoint, substitutionPoint+substitutionLength);
int mutation = randomIntBetween(0, 11);
switch (mutation) {
case 0:
// OR with random alpha of same length
result.append("("+replacementPart+"|c"+ randomABString(replacementPart.length())+")");
break;
case 1:
// OR with non-existant value
result.append("("+replacementPart+"|doesnotexist)");
break;
case 2:
// OR with another randomised regex (used to create nested levels of expression).
result.append("(" + convertToRandomRegex(replacementPart) +"|doesnotexist)");
break;
case 3:
// Star-replace all ab sequences.
result.append(replacementPart.replaceAll("ab", ".*"));
break;
case 4:
// .-replace all b chars
result.append(replacementPart.replaceAll("b", "."));
break;
case 5:
// length-limited stars {1,2}
result.append(".{1,"+replacementPart.length()+"}");
break;
case 6:
// replace all chars with .
result.append(replacementPart.replaceAll(".", "."));
break;
case 7:
// OR with uppercase chars eg [aA] (many of these sorts of expression in the wild..
char [] chars = replacementPart.toCharArray();
for (char c : chars) {
result.append("[" + c + Character.toUpperCase(c) +"]");
}
break;
case 8:
// NOT a character - replace all b's with "not a"
result.append(replacementPart.replaceAll("b", "[^a]"));
break;
case 9:
// Make whole part repeatable 1 or more times
result.append("(" + replacementPart +")+");
break;
case 10:
// Make whole part repeatable 0 or more times
result.append("(" + replacementPart +")?");
break;
case 11:
// all but ... syntax
result.append("@&~(doesnotexist.+)");
break;
default:
break;
}
//add any remaining tail, unchanged
if(substitutionPoint + substitutionLength <= randomValue.length()-1) {
result.append(randomValue.substring(substitutionPoint + substitutionLength));
}
//Assert our randomly generated regex actually matches the provided raw input.
RegExp regex = new RegExp(result.toString());
Automaton automaton = regex.toAutomaton();
ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
BytesRef br = new BytesRef(randomValue);
assertTrue("[" + result.toString() + "]should match [" + randomValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
+ randomValue.length(), bytesMatcher.run(br.bytes, br.offset, br.length));
return result.toString();
}
 protected MappedFieldType provideMappedFieldType(String name) {
     if (name.equals(WILDCARD_FIELD_NAME)) {
@@ -284,7 +682,11 @@ public class WildcardFieldMapperTests extends ESTestCase {
         StringBuilder sb = new StringBuilder();
         while (sb.length() < minLength) {
             if (randomBoolean()) {
-                sb.append("a");
+                if (randomBoolean()) {
+                    sb.append("a");
+                } else {
+                    sb.append("A");
+                }
             } else {
                 sb.append("b");
             }