Search - add range query support to wildcard field (#57881) (#57988)

Backport to add range query support to wildcard field

Closes #57816
This commit is contained in:
markharwood 2020-06-12 11:30:54 +01:00 committed by GitHub
parent db03e7c93b
commit 2da8e57f59
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 250 additions and 1 deletion

View File

@ -30,15 +30,19 @@ import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.RegExp.Kind; import org.apache.lucene.util.automaton.RegExp.Kind;
import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.geo.ShapeRelation;
import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.time.DateMathParser;
import org.elasticsearch.common.unit.Fuzziness; import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.XContentParser;
@ -70,6 +74,7 @@ import org.elasticsearch.search.aggregations.support.ValuesSourceType;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.time.ZoneId;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
@ -614,6 +619,12 @@ public class WildcardFieldMapper extends FieldMapper {
return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery; return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
} }
// Returns the first ngram token produced for the given fragment; a
// LinkedHashSet preserves the order in which tokens were emitted.
protected String firstNgramToken(String fragment) {
Set<String> orderedTokens = new LinkedHashSet<>();
getNgramTokens(orderedTokens, fragment);
Iterator<String> tokenIterator = orderedTokens.iterator();
return tokenIterator.next();
}
protected void getNgramTokens(Set<String> tokens, String fragment) { protected void getNgramTokens(Set<String> tokens, String fragment) {
if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) { if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
// If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search // If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search
@ -678,6 +689,90 @@ public class WildcardFieldMapper extends FieldMapper {
} }
} }
/**
 * Builds a range query over wildcard-field values. The result combines two MUST clauses:
 * an "acceleration" query over the indexed ngram tokens that cheaply narrows the candidate
 * set, and a slow-but-exact automaton query (run against the binary doc values) that
 * verifies each candidate against the requested bounds.
 * The relation/timeZone/parser parameters are part of the shared rangeQuery signature
 * and are not used by this string-based implementation.
 */
@Override
public Query rangeQuery(
Object lowerTerm,
Object upperTerm,
boolean includeLower,
boolean includeUpper,
ShapeRelation relation,
ZoneId timeZone,
DateMathParser parser,
QueryShardContext context
) {
// Range queries on wildcard fields always require per-doc verification, so they are
// gated behind the allow-expensive-queries setting.
if (context.allowExpensiveQueries() == false) {
throw new ElasticsearchException("[range] queries on [wildcard] fields cannot be executed when '" +
ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false.");
}
BytesRef lower = lowerTerm == null ? null : BytesRefs.toBytesRef(lowerTerm);
BytesRef upper = upperTerm == null ? null : BytesRefs.toBytesRef(upperTerm);
Query accelerationQuery = null;
if (lowerTerm != null && upperTerm != null) {
// Long common prefixes e.g. "C:/Program Files/a.txt" to "C:/Program Files/z.txt"
// can be accelerated by searching for all the common leading ngrams e.g. c:/, /pr, rog, gra etc
StringBuilder commonPrefix = new StringBuilder();
String lowerS = addLineEndChars(toLowerCase(lower.utf8ToString()));
String upperS = addLineEndChars(toLowerCase(upper.utf8ToString()));
// Walk both bounds codepoint-by-codepoint and collect the shared leading run.
for (int i = 0; i < Math.min(lowerS.length(), upperS.length());) {
final int cL = lowerS.codePointAt(i);
final int cU = upperS.codePointAt(i);
if (cL == cU) {
commonPrefix.append(Character.toChars(cL));
} else {
break;
}
int length = Character.charCount(cL);
i += length;
}
if (commonPrefix.length() > 0) {
Set<String> tokens = new HashSet<>();
getNgramTokens(tokens, commonPrefix.toString());
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
for (String token : tokens) {
int tokenSize = token.codePointCount(0, token.length());
// Single-character tokens and the end-of-string marker are too common to
// usefully narrow the candidate set, so skip them.
if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
continue;
}
if (tokenSize == NGRAM_SIZE) {
// Full-size ngram - must be matched exactly.
TermQuery tq = new TermQuery(new Term(name(), token));
bqBuilder.add(new BooleanClause(tq, Occur.MUST));
} else {
// Trailing fragment shorter than NGRAM_SIZE - match it as a prefix of any indexed ngram.
PrefixQuery wq = new PrefixQuery(new Term(name(), token));
wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
bqBuilder.add(new BooleanClause(wq, Occur.MUST));
}
}
BooleanQuery bq = bqBuilder.build();
// Only use the ngram conjunction if at least one clause survived the filtering above.
if (bq.clauses().size() > 0) {
accelerationQuery = bq;
}
}
}
if (accelerationQuery == null) {
// Fallback - if there is no common prefix sequence then we look for the range of ngrams that appear at the start
// of the string e.g. given 100 to 999 we would search for ngrams in the range
// TOKEN_START_OR_END_CHAR + "10" to
// TOKEN_START_OR_END_CHAR + "99"
// Bounds are deliberately kept inclusive here; exact inclusivity is enforced by the
// verification query below.
BytesRef lowerNgram = lower == null ? null : new BytesRef(firstNgramToken(
addLineEndChars(toLowerCase(lower.utf8ToString()))));
BytesRef upperNgram = upper == null ? null : new BytesRef(firstNgramToken(
addLineEndChars(toLowerCase(upper.utf8ToString()))));
accelerationQuery = new TermRangeQuery(name(), lowerNgram, upperNgram, true, true);
}
// Exact check: the range automaton is built lazily and applied to the original values
// held in binary doc values, honouring the caller's inclusivity flags.
Supplier <Automaton> deferredAutomatonSupplier = ()->{
return TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
};
AutomatonQueryOnBinaryDv slowQuery = new AutomatonQueryOnBinaryDv(name(), lower + "-" + upper, deferredAutomatonSupplier);
// Both clauses are MUST: the acceleration query prunes, the automaton query verifies.
BooleanQuery.Builder qBuilder = new BooleanQuery.Builder();
qBuilder.add(accelerationQuery, Occur.MUST);
qBuilder.add(slowQuery, Occur.MUST);
return qBuilder.build();
}
@Override @Override
public Query fuzzyQuery( public Query fuzzyQuery(
Object value, Object value,

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort; import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
@ -214,7 +215,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
Query wildcardFieldQuery = null; Query wildcardFieldQuery = null;
Query keywordFieldQuery = null; Query keywordFieldQuery = null;
String pattern = null; String pattern = null;
switch (randomInt(3)) { switch (randomInt(4)) {
case 0: case 0:
pattern = getRandomWildcardPattern(); pattern = getRandomWildcardPattern();
wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC); wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
@ -259,6 +260,14 @@ public class WildcardFieldMapperTests extends ESTestCase {
keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50, keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
transpositions, MOCK_QSC); transpositions, MOCK_QSC);
break; break;
case 4:
TermRangeQuery trq = getRandomRange(values);
wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
trq.includesUpper(), null, null, null, MOCK_QSC);
keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
trq.includesUpper(), null, null, null, MOCK_QSC);
break;
} }
TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE); TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE);
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE); TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE);
@ -294,6 +303,76 @@ public class WildcardFieldMapperTests extends ESTestCase {
dir.close(); dir.close();
} }
// Convenience wrapper: builds both the Lucene document and the parse-context
// document for a single value and hands them to the indexing helper.
private void indexDoc(RandomIndexWriter iw, String value) throws IOException {
ParseContext.Document parsedDocument = new ParseContext.Document();
Document luceneDocument = new Document();
addFields(parsedDocument, luceneDocument, value);
indexDoc(parsedDocument, luceneDocument, iw);
}
// Indexes a fixed set of values and checks that wildcard-field range queries return
// exactly the same documents as the equivalent keyword-field range queries, covering
// both acceleration strategies (long common prefix vs no common prefix).
public void testRangeQueryVersusKeywordField() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
// Values sharing a long common prefix (exercises prefix-ngram acceleration)
indexDoc(iw, "C:\\Program Files\\a.txt");
indexDoc(iw, "C:\\Program Files\\n.txt");
indexDoc(iw, "C:\\Program Files\\z.txt");
// Values with no common prefix (exercises the first-ngram range fallback)
indexDoc(iw, "a.txt");
indexDoc(iw, "n.txt");
indexDoc(iw, "z.txt");
iw.forceMerge(1);
DirectoryReader reader = iw.getReader();
IndexSearcher searcher = newSearcher(reader);
iw.close();
// {lower, upper} bound pairs; null means an open-ended bound.
String[][] boundsTable = {
{"C:\\Program Files\\a", "C:\\Program Files\\z"},
{"C:\\Program Files\\a", "C:\\Program Files\\n"},
{null, "C:\\Program Files\\z"},
{"C:\\Program Files\\a", null},
{"a.txt", "z.txt"},
{"a.txt", "n.txt"},
{null, "z.txt"},
{"a.txt", null}
};
for (String[] bounds : boundsTable) {
BytesRef lowerBound = bounds[0] == null ? null : new BytesRef(bounds[0]);
BytesRef upperBound = bounds[1] == null ? null : new BytesRef(bounds[1]);
TermRangeQuery trq = new TermRangeQuery(WILDCARD_FIELD_NAME, lowerBound, upperBound, randomBoolean(), randomBoolean());
Query wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(),
trq.includesLower(), trq.includesUpper(), null, null, null, MOCK_QSC);
Query keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(),
trq.includesLower(), trq.includesUpper(), null, null, null, MOCK_QSC);
TopDocs kwTopDocs = searcher.search(keywordFieldQuery, 10, Sort.RELEVANCE);
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.RELEVANCE);
assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value));
// Both queries must select the identical set of documents.
HashSet<Integer> keywordDocIds = new HashSet<>();
for (ScoreDoc kwHit : kwTopDocs.scoreDocs) {
keywordDocIds.add(kwHit.doc);
}
HashSet<Integer> wildcardDocIds = new HashSet<>();
for (ScoreDoc wcHit : wildcardFieldTopDocs.scoreDocs) {
wildcardDocIds.add(wcHit.doc);
}
assertThat(wildcardDocIds, equalTo(keywordDocIds));
}
reader.close();
dir.close();
}
public void testRegexAcceleration() throws IOException, ParseException { public void testRegexAcceleration() throws IOException, ParseException {
// All these expressions should rewrite to a match all with no verification step required at all // All these expressions should rewrite to a match all with no verification step required at all
String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"}; String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"};
@ -485,6 +564,54 @@ public class WildcardFieldMapperTests extends ESTestCase {
} }
} }
// Fixture describing one range-acceleration expectation: the bounds of an
// inclusive range query plus the space-separated ngrams expected in the
// accelerating query ('_' stands in for the token start marker).
static class RangeTest {
String lower;
String upper;
String ngrams;

RangeTest(String lower, String upper, String ngrams) {
this.lower = lower;
this.upper = upper;
this.ngrams = ngrams;
}

// Builds the wildcard-field range query under test (bounds always inclusive).
Query getRangeQuery() {
return wildcardFieldType.fieldType().rangeQuery(lower, upper, true, true, null, null, null, MOCK_QSC);
}

// Expected acceleration clauses: one MUST TermQuery per declared ngram.
Query getExpectedApproxQuery() throws ParseException {
BooleanQuery.Builder expected = new BooleanQuery.Builder();
if (ngrams != null) {
for (String ngram : ngrams.split(" ")) {
String token = ngram.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING);
expected.add(new TermQuery(new Term(WILDCARD_FIELD_NAME, token)), Occur.MUST);
}
}
return expected.build();
}
}
// Verifies that inclusive range queries with long common prefixes generate the
// expected set of accelerating ngram clauses.
public void testRangeAcceleration() throws IOException, ParseException {
RangeTest[] rangeTests = {
new RangeTest("c:/a.txt", "c:/z.txt", "_c: c:/"),
new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z.txt", "_c: :/p pro ogr ram mfi ile es/"),
};
for (RangeTest rangeTest : rangeTests) {
Query rangeQuery = rangeTest.getRangeQuery();
testExpectedAccelerationQuery(rangeTest.lower + "-" + rangeTest.upper, rangeQuery, rangeTest.getExpectedApproxQuery());
}
}
void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException { void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {
QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer()); QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
@ -531,6 +658,33 @@ public class WildcardFieldMapperTests extends ESTestCase {
return randomValue; return randomValue;
} }
// Derives a random range from one of the indexed values: the value itself is the
// lower bound and a copy with a random slice's 'a' characters swapped to 'z'
// characters is the upper bound. Inclusivity of each end is randomized.
private TermRangeQuery getRandomRange(HashSet<String> values) {
// Pick one of the indexed document values to focus our queries on.
String lowerValue = values.toArray(new String[0])[randomIntBetween(0, values.size() - 1)];
// Choose a random slice of the string to mutate.
int start = randomIntBetween(0, lowerValue.length() - 1);
int length = randomIntBetween(1, Math.min(10, lowerValue.length() - start));
// Head is carried over unchanged; the slice is pushed "upwards" (a -> z).
String head = lowerValue.substring(0, start);
String mutated = lowerValue.substring(start, start + length).replaceAll("a", "z");
// Any remaining tail is also carried over unchanged.
String tail = start + length <= lowerValue.length() - 1 ? lowerValue.substring(start + length) : "";
String upperValue = head + mutated + tail;
return new TermRangeQuery(WILDCARD_FIELD_NAME, new BytesRef(lowerValue), new BytesRef(upperValue),
randomBoolean(), randomBoolean());
}
private String getRandomRegexPattern(HashSet<String> values) { private String getRandomRegexPattern(HashSet<String> values) {
// Pick one of the indexed document values to focus our queries on. // Pick one of the indexed document values to focus our queries on.
String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)]; String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)];