Wildcard field - add normalisation of ngram tokens to reduce disk space. (#63120) (#63193)

Adds normalisation of ngram tokens to reduce disk space.
All punctuation becomes the / char and, for a-z/0-9 chars (tokens are lowercased first), even codepoints are folded to the prior odd one, e.g. aab becomes aaa.
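
A minimal sketch of the folding rule, using the PunctuationFoldingFilter.normalize helper added in this commit (inputs are assumed to be lowercased already, since the analyzer chain applies LowerCaseFilter first):

    // 'b' (0x62, even) folds to 'a' (0x61); '-' is punctuation and folds to '/'.
    assert PunctuationFoldingFilter.normalize("aab").equals("aaa");
    assert PunctuationFoldingFilter.normalize("f-o").equals("e/o"); // 'f' (0x66) -> 'e' (0x65)
    // Folding halves the a-z/0-9 alphabet and collapses all punctuation into a
    // single char, so far fewer distinct ngram terms are stored on disk.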

Closes #62817
markharwood 2020-10-02 16:24:27 +01:00 committed by GitHub
parent 5370f270d7
commit bfb3071539
2 changed files with 214 additions and 95 deletions

WildcardFieldMapper.java

@@ -8,6 +8,8 @@
package org.elasticsearch.xpack.wildcard.mapper;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
@@ -37,6 +39,7 @@ import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.RegExp.Kind;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.Version;
import org.elasticsearch.common.geo.ShapeRelation;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
@@ -91,13 +94,94 @@ public class WildcardFieldMapper extends FieldMapper {
public static final String CONTENT_TYPE = "wildcard";
public static short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10;
public static final int NGRAM_SIZE = 3;
static final NamedAnalyzer WILDCARD_ANALYZER = new NamedAnalyzer("_wildcard", AnalyzerScope.GLOBAL, new Analyzer() {
static final NamedAnalyzer WILDCARD_ANALYZER_7_10 = new NamedAnalyzer("_wildcard_7_10", AnalyzerScope.GLOBAL, new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE);
return new TokenStreamComponents(tokenizer);
TokenStream tok = new LowerCaseFilter(tokenizer);
tok = new PunctuationFoldingFilter(tok);
return new TokenStreamComponents(r -> {
tokenizer.setReader(r);
}, tok);
}
});
// @deprecated - used for BWC with elasticsearch 7.9
static final NamedAnalyzer WILDCARD_ANALYZER_7_9 = new NamedAnalyzer("_wildcard", AnalyzerScope.GLOBAL, new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE);
TokenStream tok = new LowerCaseFilter(tokenizer);
return new TokenStreamComponents(r -> {
tokenizer.setReader(r);
}, tok);
}
});
public static class PunctuationFoldingFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
* Create a new PunctuationFoldingFilter that normalizes token text: even ascii letter/digit
* codepoints are folded to the preceding odd codepoint and punctuation is replaced with /
*
* @param in TokenStream to filter
*/
public PunctuationFoldingFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
normalize(termAtt.buffer(), 0, termAtt.length());
return true;
} else
return false;
}
public static String normalize(String s) {
char[] chars = s.toCharArray();
normalize(chars, 0, chars.length);
return new String(chars);
}
/**
* Normalizes a token
*/
public static void normalize(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert 0 <= offset && offset <= buffer.length;
for (int i = offset; i < limit;) {
int codepoint = Character.codePointAt(buffer, i, limit);
i += Character.toChars(
normalize(codepoint), buffer, i);
}
}
private static int normalize(int codepoint) {
if (codepoint == TOKEN_START_OR_END_CHAR) {
return codepoint;
}
if (Character.isLetterOrDigit(codepoint) == false) {
// Replace non letters or digits with /
return 47;
}
// For the remaining letters and digits, fold even ascii codepoints to the prior odd one.
if (codepoint > 48 && codepoint <= 128 && codepoint % 2 == 0) {
// Even ascii chars in the 0-9 and a-z ranges fold to the preceding odd char.
return codepoint - 1;
} else {
// Return odd ascii chars and non-ascii codepoints unchanged.
return codepoint;
}
}
}
public static class Defaults {
public static final FieldType FIELD_TYPE = new FieldType();
@@ -172,8 +256,14 @@ public class WildcardFieldMapper extends FieldMapper {
@Override
public WildcardFieldMapper build(BuilderContext context) {
return new WildcardFieldMapper(
name, fieldType, new WildcardFieldType(buildFullName(context), fieldType, meta), ignoreAbove,
multiFieldsBuilder.build(this, context), copyTo, nullValue);
name,
fieldType,
new WildcardFieldType(buildFullName(context), fieldType, meta, context.indexCreatedVersion()),
ignoreAbove,
multiFieldsBuilder.build(this, context),
copyTo,
nullValue
);
}
}
@@ -212,17 +302,21 @@ public class WildcardFieldMapper extends FieldMapper {
static Analyzer lowercaseNormalizer = new LowercaseNormalizer();
private WildcardFieldType(String name, FieldType fieldType, Map<String, String> meta) {
private WildcardFieldType(String name, FieldType fieldType, Map<String, String> meta, Version version) {
super(name, true, fieldType.stored(), true,
new TextSearchInfo(fieldType, null, Lucene.KEYWORD_ANALYZER, Lucene.KEYWORD_ANALYZER), meta);
setIndexAnalyzer(WILDCARD_ANALYZER);
if (version.onOrAfter(Version.V_7_10_0)) {
setIndexAnalyzer(WILDCARD_ANALYZER_7_10);
} else {
setIndexAnalyzer(WILDCARD_ANALYZER_7_9);
}
}
@Override
public Query wildcardQuery(String wildcardPattern, RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
String ngramIndexPattern = addLineEndChars(toLowerCase(wildcardPattern));
String ngramIndexPattern = addLineEndChars(wildcardPattern);
// Break search term into tokens
Set<String> tokens = new LinkedHashSet<>();
StringBuilder sequence = new StringBuilder();
@@ -305,8 +399,8 @@
if (value.length() == 0) {
return new MatchNoDocsQuery();
}
RegExp ngramRegex = new RegExp(addLineEndChars(toLowerCase(value)), syntaxFlags, matchFlags);
RegExp ngramRegex = new RegExp(addLineEndChars(value), syntaxFlags, matchFlags);
Query approxBooleanQuery = toApproximationQuery(ngramRegex);
Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery);
@@ -590,7 +684,7 @@
return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
}
protected String firstNgramToken(String fragment) {
protected String firstNgramToken(String fragment, Analyzer analyzer) {
LinkedHashSet<String> tokens = new LinkedHashSet<>();
getNgramTokens(tokens, fragment);
return tokens.iterator().next();
@@ -603,41 +697,30 @@
return;
}
// Break fragment into multiple Ngrams
TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment);
TokenStream tokenizer = indexAnalyzer().tokenStream(name(), fragment);
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
// If fragment length < NGRAM_SIZE then it is not emitted by token stream so need
// to initialise with the value here
String lastUnusedToken = fragment;
int foundTokens = 0;
try {
tokenizer.reset();
boolean takeThis = true;
// minimise number of terms searched - eg for "12345" and 3grams we only need terms
// `123` and `345` - no need to search for 234. We take every other ngram.
while (tokenizer.incrementToken()) {
String tokenValue = termAtt.toString();
if (takeThis) {
tokens.add(tokenValue);
lastUnusedToken = null;
} else {
lastUnusedToken = tokenValue;
}
// alternate
takeThis = !takeThis;
if (tokens.size() >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
lastUnusedToken = null;
break;
}
}
if (lastUnusedToken != null) {
// given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing
// `ake` to complete the logic.
tokens.add(lastUnusedToken);
tokens.add(tokenValue);
foundTokens++;
}
tokenizer.end();
tokenizer.close();
} catch (IOException ioe) {
throw new ElasticsearchParseException("Error parsing wildcard regex pattern fragment [" + fragment + "]");
}
if (foundTokens == 0 && fragment.length() > 0) {
// fragment must have been less than NGRAM_SIZE - add a placeholder which may be used in a prefix query e.g. ab*
fragment = toLowerCase(fragment);
if (indexAnalyzer() == WILDCARD_ANALYZER_7_10) {
fragment = PunctuationFoldingFilter.normalize(fragment);
}
tokens.add(fragment);
}
}
@@ -678,8 +761,8 @@
// Long common prefixes e.g. "C:/Program Files/a,txt" to "C:/Program Files/z,txt"
// can be accelerated by searching for all the common leading ngrams e.g. c:/, /pr, rog, gra etc
StringBuilder commonPrefix = new StringBuilder();
String lowerS = addLineEndChars(toLowerCase(lower.utf8ToString()));
String upperS = addLineEndChars(toLowerCase(upper.utf8ToString()));
String lowerS = addLineEndChars(lower.utf8ToString());
String upperS = addLineEndChars(upper.utf8ToString());
for (int i = 0; i < Math.min(lowerS.length(), upperS.length());) {
final int cL = lowerS.codePointAt(i);
final int cU = upperS.codePointAt(i);
@@ -717,23 +800,15 @@
}
}
}
if (accelerationQuery == null) {
// Fallback - if there is no common prefix sequence then we look for the range of ngrams that appear at the start
// of the string e.g. given 100 to 999 we would search for ngrams in the range
// TOKEN_START_OR_END_CHAR + "10" to
// TOKEN_START_OR_END_CHAR + "99"
BytesRef lowerNgram = lower == null ? null : new BytesRef(firstNgramToken(
addLineEndChars(toLowerCase(lower.utf8ToString()))));
BytesRef upperNgram = upper == null ? null : new BytesRef(firstNgramToken(
addLineEndChars(toLowerCase(upper.utf8ToString()))));
accelerationQuery = new TermRangeQuery(name(), lowerNgram, upperNgram, true, true);
}
Supplier<Automaton> deferredAutomatonSupplier = () -> {
return TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
};
AutomatonQueryOnBinaryDv slowQuery = new AutomatonQueryOnBinaryDv(name(), lower + "-" + upper, deferredAutomatonSupplier);
if (accelerationQuery == null) {
return slowQuery;
}
BooleanQuery.Builder qBuilder = new BooleanQuery.Builder();
qBuilder.add(accelerationQuery, Occur.MUST);
qBuilder.add(slowQuery, Occur.MUST);
@@ -750,26 +825,25 @@
QueryShardContext context
) {
String searchTerm = BytesRefs.toString(value);
String lowerSearchTerm = toLowerCase(searchTerm);
try {
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
//The approximation query can have a prefix and any number of ngrams.
BooleanQuery.Builder approxBuilder = new BooleanQuery.Builder();
String postPrefixString = lowerSearchTerm;
String postPrefixString = searchTerm;
// Add all content prior to prefixLength as a MUST clause to the ngram index query
if (prefixLength > 0) {
Set<String> prefixTokens = new LinkedHashSet<>();
postPrefixString = lowerSearchTerm.substring(prefixLength);
String prefixCandidate = TOKEN_START_OR_END_CHAR + lowerSearchTerm.substring(0, prefixLength);
postPrefixString = searchTerm.substring(prefixLength);
String prefixCandidate = TOKEN_START_OR_END_CHAR + searchTerm.substring(0, prefixLength);
getNgramTokens(prefixTokens, prefixCandidate);
for (String prefixToken : prefixTokens) {
addClause(prefixToken, approxBuilder, Occur.MUST);
}
}
// Tokenize all content after the prefix
TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), postPrefixString);
TokenStream tokenizer = indexAnalyzer().tokenStream(name(), postPrefixString);
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
ArrayList<String> postPrefixTokens = new ArrayList<>();
String firstToken = null;
@@ -985,10 +1059,7 @@
if (value == null || value.length() > ignoreAbove) {
return;
}
// Always lower case the ngram index and value - helps with
// a) speed (less ngram variations to explore on disk and in RAM-based automaton) and
// b) uses less disk space
String ngramValue = addLineEndChars(WildcardFieldType.toLowerCase(value));
String ngramValue = addLineEndChars(value);
Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType);
fields.add(ngramField);
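
A note on the test expectation changes in the test file below: query fragments are now folded before ngramming. A worked example (a sketch, not part of the diff) using the new filter:

    // "foobar" folds to "eooaaq" ('f' -> 'e', 'b' -> 'a', 'r' -> 'q'); with the
    // start/end markers the indexed string is "_eooaaq__", whose 3-grams are
    // _eo eoo ooa oaa aaq aq_ q__ - exactly the expectations in the tests below.
    String folded = PunctuationFoldingFilter.normalize("foobar"); // "eooaaq"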

WildcardFieldMapperTests.java

@@ -77,18 +77,21 @@ import static org.mockito.Mockito.when;
public class WildcardFieldMapperTests extends ESTestCase {
static QueryShardContext createMockQueryShardContext(boolean allowExpensiveQueries) {
static QueryShardContext createMockQueryShardContext(boolean allowExpensiveQueries, Version version) {
QueryShardContext queryShardContext = mock(QueryShardContext.class);
when(queryShardContext.allowExpensiveQueries()).thenReturn(allowExpensiveQueries);
when(queryShardContext.indexVersionCreated()).thenReturn(version);
return queryShardContext;
}
}
private static final String KEYWORD_FIELD_NAME = "keyword_field";
private static final String WILDCARD_FIELD_NAME = "wildcard_field";
public static final QueryShardContext MOCK_QSC = createMockQueryShardContext(true);
public static final QueryShardContext MOCK_QSC = createMockQueryShardContext(true, Version.CURRENT);
public static final QueryShardContext MOCK_7_9_QSC = createMockQueryShardContext(true, Version.V_7_9_0);
static final int MAX_FIELD_LENGTH = 30;
static WildcardFieldMapper wildcardFieldType;
static WildcardFieldMapper wildcardFieldType79;
static KeywordFieldMapper keywordFieldType;
@Override
@@ -96,11 +99,17 @@ public class WildcardFieldMapperTests extends ESTestCase {
public void setUp() throws Exception {
Builder builder = new WildcardFieldMapper.Builder(WILDCARD_FIELD_NAME);
builder.ignoreAbove(MAX_FIELD_LENGTH);
wildcardFieldType = builder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0)));
wildcardFieldType = builder.build(
new Mapper.BuilderContext(createIndexSettings(Version.CURRENT).getSettings(), new ContentPath(0))
);
wildcardFieldType79 = builder.build(
new Mapper.BuilderContext(createIndexSettings(Version.V_7_9_0).getSettings(), new ContentPath(0))
);
org.elasticsearch.index.mapper.KeywordFieldMapper.Builder kwBuilder = new KeywordFieldMapper.Builder(KEYWORD_FIELD_NAME);
keywordFieldType = kwBuilder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0)));
keywordFieldType = kwBuilder.build(
new Mapper.BuilderContext(createIndexSettings(Version.CURRENT).getSettings(), new ContentPath(0))
);
super.setUp();
}
@@ -120,7 +129,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
public void testTooBigKeywordField() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@@ -143,11 +152,44 @@ public class WildcardFieldMapperTests extends ESTestCase {
reader.close();
dir.close();
}
public void testBWCIndexVersion() throws IOException {
// Create old format index using wildcard ngram analyzer used in 7.9 launch
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_9);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
Document doc = new Document();
ParseContext.Document parseDoc = new ParseContext.Document();
addFields(parseDoc, doc, "a b");
indexDoc(parseDoc, doc, iw);
iw.forceMerge(1);
DirectoryReader reader = iw.getReader();
IndexSearcher searcher = newSearcher(reader);
iw.close();
// Unnatural circumstance - test that querying with the new analyzer fails to match on the old-format index
Query oldWildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery("a b", null, null);
TopDocs oldWildcardFieldTopDocs = searcher.search(oldWildcardFieldQuery, 10, Sort.INDEXORDER);
assertThat(oldWildcardFieldTopDocs.totalHits.value, equalTo(0L));
// Natural circumstance - test that we revert to the old analyzer for old indices
Query wildcardFieldQuery = wildcardFieldType79.fieldType().wildcardQuery("a b", null, null);
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(1L));
reader.close();
dir.close();
}
// Test that long query strings don't cause exceptions
public void testTooBigQueryField() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@@ -182,7 +224,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
public void testTermAndPrefixQueryIgnoreWildcardSyntax() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@@ -223,7 +265,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
public void testSearchResultsVersusKeywordField() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@@ -361,7 +403,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
public void testRangeQueryVersusKeywordField() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@@ -374,6 +416,10 @@ public class WildcardFieldMapperTests extends ESTestCase {
indexDoc(iw, "a.txt");
indexDoc(iw, "n.txt");
indexDoc(iw, "z.txt");
indexDoc(iw, "A.txt");
indexDoc(iw, "N.txt");
indexDoc(iw, "^.txt");
indexDoc(iw, "Z.txt");
iw.forceMerge(1);
DirectoryReader reader = iw.getReader();
@@ -390,7 +436,8 @@ public class WildcardFieldMapperTests extends ESTestCase {
{"a.txt", "z.txt"},
{"a.txt", "n.txt"},
{null, "z.txt"},
{"a.txt", null}
{"a.txt", null},
{"A.txt", "z.txt"}
};
for (String[] bounds : rangeTests) {
@@ -436,15 +483,16 @@ public class WildcardFieldMapperTests extends ESTestCase {
// All of these regexes should be accelerated as the equivalent of the given QueryString query
String acceleratedTests[][] = {
{".*foo.*", "foo"},
{"..foobar","+foo +oba +ar_ +r__"},
{"(maynotexist)?foobar","+foo +oba +ar_ +r__"},
{".*/etc/passw.*", "+\\/et +tc\\/ +\\/pa +ass +ssw"},
{".*etc/passwd", "+etc +c\\/p +pas +ssw +wd_ +d__"},
{"(http|ftp)://foo.*", "+((+htt +ttp) ftp) +(+\\:\\/\\/ +\\/fo +foo)"},
{"[Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll]\\.[Ee][Xx][Ee]", "+_po +owe +ers +she +ell +l\\.e +exe +e__"},
{"foo<1-100>bar", "+(+_fo +foo) +(+bar +r__ )"},
{"(aaa.+&.+bbb)cat", "+cat +t__"},
{".*foo.*", "eoo"},
{"..foobar","+eoo +ooa +oaa +aaq +aq_ +q__"},
{"(maynotexist)?foobar","+eoo +ooa +oaa +aaq +aq_ +q__"},
{".*/etc/passw.*", "+\\/es +esc +sc\\/ +c\\/o +\\/oa +oas +ass +ssw"},
{".*etc/passwd", " +esc +sc\\/ +c\\/o +\\/oa +oas +ass +ssw +swc +wc_ +c__"},
{"(http|ftp)://foo.*", "+((+gss +sso) eso) +(+\\/\\/\\/ +\\/\\/e +\\/eo +eoo)"},
{"[Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll]\\.[Ee][Xx][Ee]",
"+_oo +oow +owe +weq +eqs +qsg +sge +gek +ekk +kk\\/ +k\\/e +\\/ew +ewe +we_ +e__"},
{"foo<1-100>bar", "+(+_eo +eoo) +(+aaq +aq_ +q__)"},
{"(aaa.+&.+bbb)cat", "+cas +as_ +s__"},
{".a", "a__"}
};
for (String[] test : acceleratedTests) {
@@ -469,7 +517,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
String suboptimalTests[][] = {
// TODO short wildcards like a* OR b* aren't great so we just drop them.
// Ideally we would attach to successors to create (acd OR bcd)
{ "[ab]cd", "+cd_ +d__"}
{ "[ab]cd", "+cc_ +c__"}
};
for (String[] test : suboptimalTests) {
String regex = test[0];
@@ -499,13 +547,13 @@ public class WildcardFieldMapperTests extends ESTestCase {
// All of these patterns should be accelerated.
String tests[][] = {
{ "*foobar", "+foo +oba +ar_ +r__" },
{ "foobar*", "+_fo +oob +bar" },
{ "foo\\*bar*", "+_fo +oo\\* +\\*ba +bar" },
{ "foo\\?bar*", "+_fo +oo\\? +\\?ba +bar" },
{ "foo*bar", "+_fo +foo +bar +r__" },
{ "foo?bar", "+_fo +foo +bar +r__" },
{ "?foo*bar?", "+foo +bar" },
{ "*foobar", "+eoo +ooa +oaa +aaq +aq_ +q__" },
{ "foobar*", "+_eo +eoo +ooa +oaa +aaq" },
{ "foo\\*bar*", "+_eo +eoo +oo\\/ +o\\/a +\\/aa +aaq" },
{ "foo\\?bar*", "+_eo +eoo +oo\\/ +o\\/a +\\/aa +aaq" },
{ "foo*bar", "+_eo +eoo +aaq +aq_ +q__" },
{ "foo?bar", "+_eo +eoo +aaq +aq_ +q__" },
{ "?foo*bar?", "+eoo +aaq" },
{ "*c", "+c__" } };
for (String[] test : tests) {
String pattern = test[0];
@@ -601,10 +649,10 @@ public class WildcardFieldMapperTests extends ESTestCase {
public void testFuzzyAcceleration() throws IOException, ParseException {
FuzzyTest[] tests = {
new FuzzyTest("123456", 0, Fuzziness.ONE, null, 1, "123 456"),
new FuzzyTest("1234567890", 2, Fuzziness.ONE, "_12", 1, "345 678"),
new FuzzyTest("12345678901", 2, Fuzziness.ONE, "_12", 2, "345 678 901"),
new FuzzyTest("12345678", 4, Fuzziness.ONE, "_12 234", 0, null)
new FuzzyTest("123456", 0, Fuzziness.ONE, null, 1, "113 355"),
new FuzzyTest("1234567890", 2, Fuzziness.ONE, "_11", 1, "335 577"),
new FuzzyTest("12345678901", 2, Fuzziness.ONE, "_11", 2, "335 577 901"),
new FuzzyTest("12345678", 4, Fuzziness.ONE, "_11 113 133", 0, null)
};
for (FuzzyTest test : tests) {
Query wildcardFieldQuery = test.getFuzzyQuery();
@@ -650,8 +698,8 @@ public class WildcardFieldMapperTests extends ESTestCase {
public void testRangeAcceleration() throws IOException, ParseException {
RangeTest[] tests = {
new RangeTest("c:/a.txt", "c:/z.txt", "_c: c:/"),
new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z.txt", "_c: :/p pro ogr ram mfi ile es/"),
new RangeTest("c:/a.txt", "c:/z.txt", "_c/ c//"),
new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z/txt", "_c/ c// //o /oq oqo qog ogq gqa qam ame mei eik ike kes es/"),
};
for (RangeTest test : tests) {
Query wildcardFieldQuery = test.getRangeQuery();
@@ -893,9 +941,9 @@ public class WildcardFieldMapperTests extends ESTestCase {
iw.addDocument(doc);
}
protected IndexSettings createIndexSettings() {
protected IndexSettings createIndexSettings(Version version) {
return new IndexSettings(
IndexMetadata.builder("_index").settings(Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT))
IndexMetadata.builder("_index").settings(Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, version))
.numberOfShards(1).numberOfReplicas(0).creationDate(System.currentTimeMillis()).build(),
Settings.EMPTY);
}
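
For context, a small end-to-end usage sketch (not part of the commit; it assumes access to the package-private analyzer above and the standard Lucene TokenStream API): input is lowercased, folded, then split into 3-grams.

    // "A:B/C" lowercases to "a:b/c" and folds to "a/a/c" (':' -> '/', 'b' -> 'a').
    try (TokenStream ts = WildcardFieldMapper.WILDCARD_ANALYZER_7_10.tokenStream("field", "A:B/C")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term); // prints: a/a  /a/  a/c
        }
        ts.end();
    }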