Adds normalisation of ngram tokens to reduce disk space. All punctuation becomes the / character and, for the A-Z/0-9 range, even codepoints are folded to the preceding odd codepoint, e.g. aab becomes aaa.

Closes #62817
parent 5370f270d7
commit bfb3071539
WildcardFieldMapper.java

@@ -8,6 +8,8 @@
 package org.elasticsearch.xpack.wildcard.mapper;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
@@ -37,6 +39,7 @@ import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.RegExp;
 import org.apache.lucene.util.automaton.RegExp.Kind;
 import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.Version;
 import org.elasticsearch.common.geo.ShapeRelation;
 import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.common.lucene.Lucene;
@@ -91,13 +94,94 @@ public class WildcardFieldMapper extends FieldMapper {
     public static final String CONTENT_TYPE = "wildcard";
     public static short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10;
     public static final int NGRAM_SIZE = 3;
-    static final NamedAnalyzer WILDCARD_ANALYZER = new NamedAnalyzer("_wildcard", AnalyzerScope.GLOBAL, new Analyzer() {
+    static final NamedAnalyzer WILDCARD_ANALYZER_7_10 = new NamedAnalyzer("_wildcard_7_10", AnalyzerScope.GLOBAL, new Analyzer() {
         @Override
         public TokenStreamComponents createComponents(String fieldName) {
             Tokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE);
-            return new TokenStreamComponents(tokenizer);
+            TokenStream tok = new LowerCaseFilter(tokenizer);
+            tok = new PunctuationFoldingFilter(tok);
+            return new TokenStreamComponents(r -> {
+                tokenizer.setReader(r);
+            }, tok);
         }
     });
+
+    // @deprecated - used for BWC with elasticsearch 7.9
+    static final NamedAnalyzer WILDCARD_ANALYZER_7_9 = new NamedAnalyzer("_wildcard", AnalyzerScope.GLOBAL, new Analyzer() {
+        @Override
+        public TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE);
+            TokenStream tok = new LowerCaseFilter(tokenizer);
+            return new TokenStreamComponents(r -> {
+                tokenizer.setReader(r);
+            }, tok);
+        }
+    });
+
+    public static class PunctuationFoldingFilter extends TokenFilter {
+        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+        /**
+         * Create a new PunctuationFoldingFilter, that normalizes token text such that even-numbered ascii values
+         * are made odd and punctuation is replaced with /
+         *
+         * @param in TokenStream to filter
+         */
+        public PunctuationFoldingFilter(TokenStream in) {
+            super(in);
+        }
+
+        @Override
+        public final boolean incrementToken() throws IOException {
+            if (input.incrementToken()) {
+                normalize(termAtt.buffer(), 0, termAtt.length());
+                return true;
+            } else
+                return false;
+        }
+
+        public static String normalize(String s) {
+            char[] chars = s.toCharArray();
+            normalize(chars, 0, chars.length);
+            return new String(chars);
+        }
+
+        /**
+         * Normalizes a token
+         */
+        public static void normalize(final char[] buffer, final int offset, final int limit) {
+            assert buffer.length >= limit;
+            assert 0 <= offset && offset <= buffer.length;
+            for (int i = offset; i < limit;) {
+                int codepoint = Character.codePointAt(buffer, i, limit);
+                i += Character.toChars(normalize(codepoint), buffer, i);
+            }
+        }
+
+        private static int normalize(int codepoint) {
+            if (codepoint == TOKEN_START_OR_END_CHAR) {
+                return codepoint;
+            }
+            if (Character.isLetterOrDigit(codepoint) == false) {
+                // Replace non letters or digits with /
+                return 47;
+            }
+            // All other ascii characters: fold even values to the prior odd value.
+            if (codepoint > 48 && codepoint <= 128 && codepoint % 2 == 0) {
+                // Even ascii chars in the 0-9 a-z range are folded down by one.
+                return codepoint - 1;
+            } else {
+                // Odd ascii chars and non-ascii chars are returned unchanged.
+                return codepoint;
+            }
+        }
+
+    }
+
     public static class Defaults {
         public static final FieldType FIELD_TYPE = new FieldType();
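For reference, a hand-worked illustration of the folding rule above (not taken from the shipped tests; lower-casing happens earlier in the analyzer chain, so the inputs here are assumed to be lower case already):

    // Illustrative only - expected values derived by hand from PunctuationFoldingFilter.normalize above.
    String s1 = WildcardFieldMapper.PunctuationFoldingFilter.normalize("aab");  // "aaa"  ('b' = 98 is even and folds to 97 = 'a')
    String s2 = WildcardFieldMapper.PunctuationFoldingFilter.normalize("a-b2"); // "a/a1" ('-' is punctuation -> '/', '2' = 50 folds to 49 = '1')
    String s3 = WildcardFieldMapper.PunctuationFoldingFilter.normalize("0z");   // "0y"   ('0' = 48 is left alone, 'z' = 122 folds to 121 = 'y')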
@@ -172,8 +256,14 @@ public class WildcardFieldMapper extends FieldMapper {
         @Override
         public WildcardFieldMapper build(BuilderContext context) {
             return new WildcardFieldMapper(
-                name, fieldType, new WildcardFieldType(buildFullName(context), fieldType, meta), ignoreAbove,
-                multiFieldsBuilder.build(this, context), copyTo, nullValue);
+                name,
+                fieldType,
+                new WildcardFieldType(buildFullName(context), fieldType, meta, context.indexCreatedVersion()),
+                ignoreAbove,
+                multiFieldsBuilder.build(this, context),
+                copyTo,
+                nullValue
+            );
         }
     }
@@ -212,17 +302,21 @@ public class WildcardFieldMapper extends FieldMapper {
 
         static Analyzer lowercaseNormalizer = new LowercaseNormalizer();
 
-        private WildcardFieldType(String name, FieldType fieldType, Map<String, String> meta) {
+        private WildcardFieldType(String name, FieldType fieldType, Map<String, String> meta, Version version) {
             super(name, true, fieldType.stored(), true,
                 new TextSearchInfo(fieldType, null, Lucene.KEYWORD_ANALYZER, Lucene.KEYWORD_ANALYZER), meta);
-            setIndexAnalyzer(WILDCARD_ANALYZER);
+            if (version.onOrAfter(Version.V_7_10_0)) {
+                setIndexAnalyzer(WILDCARD_ANALYZER_7_10);
+            } else {
+                setIndexAnalyzer(WILDCARD_ANALYZER_7_9);
+            }
         }
 
         @Override
         public Query wildcardQuery(String wildcardPattern, RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
 
-            String ngramIndexPattern = addLineEndChars(toLowerCase(wildcardPattern));
+            String ngramIndexPattern = addLineEndChars(wildcardPattern);
             // Break search term into tokens
             Set<String> tokens = new LinkedHashSet<>();
             StringBuilder sequence = new StringBuilder();
@@ -305,8 +399,8 @@ public class WildcardFieldMapper extends FieldMapper {
             if (value.length() == 0) {
                 return new MatchNoDocsQuery();
             }
 
-            RegExp ngramRegex = new RegExp(addLineEndChars(toLowerCase(value)), syntaxFlags, matchFlags);
+            RegExp ngramRegex = new RegExp(addLineEndChars(value), syntaxFlags, matchFlags);
 
             Query approxBooleanQuery = toApproximationQuery(ngramRegex);
             Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery);
@@ -590,7 +684,7 @@ public class WildcardFieldMapper extends FieldMapper {
             return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
         }
 
-        protected String firstNgramToken(String fragment) {
+        protected String firstNgramToken(String fragment, Analyzer analyzer) {
             LinkedHashSet<String> tokens = new LinkedHashSet<>();
             getNgramTokens(tokens, fragment);
             return tokens.iterator().next();
@@ -603,41 +697,30 @@ public class WildcardFieldMapper extends FieldMapper {
                 return;
             }
             // Break fragment into multiple Ngrams
-            TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment);
+            TokenStream tokenizer = indexAnalyzer().tokenStream(name(), fragment);
             CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
-            // If fragment length < NGRAM_SIZE then it is not emitted by token stream so need
-            // to initialise with the value here
-            String lastUnusedToken = fragment;
+            int foundTokens = 0;
             try {
                 tokenizer.reset();
-                boolean takeThis = true;
-                // minimise number of terms searched - eg for "12345" and 3grams we only need terms
-                // `123` and `345` - no need to search for 234. We take every other ngram.
                 while (tokenizer.incrementToken()) {
                     String tokenValue = termAtt.toString();
-                    if (takeThis) {
-                        tokens.add(tokenValue);
-                        lastUnusedToken = null;
-                    } else {
-                        lastUnusedToken = tokenValue;
-                    }
-                    // alternate
-                    takeThis = !takeThis;
-                    if (tokens.size() >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
-                        lastUnusedToken = null;
-                        break;
-                    }
-                }
-                if (lastUnusedToken != null) {
-                    // given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing
-                    // `ake` to complete the logic.
-                    tokens.add(lastUnusedToken);
+                    tokens.add(tokenValue);
+                    foundTokens++;
                 }
                 tokenizer.end();
                 tokenizer.close();
             } catch (IOException ioe) {
                 throw new ElasticsearchParseException("Error parsing wildcard regex pattern fragment [" + fragment + "]");
             }
 
+            if (foundTokens == 0 && fragment.length() > 0) {
+                // fragment must have been less than NGRAM_SIZE - add a placeholder which may be used in a prefix query e.g. ab*
+                fragment = toLowerCase(fragment);
+                if (indexAnalyzer() == WILDCARD_ANALYZER_7_10) {
+                    fragment = PunctuationFoldingFilter.normalize(fragment);
+                }
+                tokens.add(fragment);
+            }
         }
@@ -678,8 +761,8 @@ public class WildcardFieldMapper extends FieldMapper {
                 // Long common prefixes e.g. "C:/Program Files/a,txt" to "C:/Program Files/z,txt"
                 // can be accelerated by searching for all the common leading ngrams e.g. c:/, /pr, rog, gra etc
                 StringBuilder commonPrefix = new StringBuilder();
-                String lowerS = addLineEndChars(toLowerCase(lower.utf8ToString()));
-                String upperS = addLineEndChars(toLowerCase(upper.utf8ToString()));
+                String lowerS = addLineEndChars(lower.utf8ToString());
+                String upperS = addLineEndChars(upper.utf8ToString());
                 for (int i = 0; i < Math.min(lowerS.length(), upperS.length());) {
                     final int cL = lowerS.codePointAt(i);
                     final int cU = upperS.codePointAt(i);
@@ -717,23 +800,15 @@ public class WildcardFieldMapper extends FieldMapper {
                     }
                 }
             }
-            if (accelerationQuery == null) {
-                // Fallback - if there is no common prefix sequence then we look for the range of ngrams that appear at the start
-                // of the string e.g. given 100 to 999 we would search for ngrams in the range
-                //   TOKEN_START_OR_END_CHAR + "10" to
-                //   TOKEN_START_OR_END_CHAR + "99"
-                BytesRef lowerNgram = lower == null ? null : new BytesRef(firstNgramToken(
-                    addLineEndChars(toLowerCase(lower.utf8ToString()))));
-                BytesRef upperNgram = upper == null ? null : new BytesRef(firstNgramToken(
-                    addLineEndChars(toLowerCase(upper.utf8ToString()))));
-                accelerationQuery = new TermRangeQuery(name(), lowerNgram, upperNgram, true, true);
-            }
 
             Supplier <Automaton> deferredAutomatonSupplier = ()->{
                 return TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
             };
             AutomatonQueryOnBinaryDv slowQuery = new AutomatonQueryOnBinaryDv(name(), lower + "-" + upper, deferredAutomatonSupplier);
 
+            if (accelerationQuery == null) {
+                return slowQuery;
+            }
+
             BooleanQuery.Builder qBuilder = new BooleanQuery.Builder();
             qBuilder.add(accelerationQuery, Occur.MUST);
             qBuilder.add(slowQuery, Occur.MUST);
@@ -750,26 +825,25 @@ public class WildcardFieldMapper extends FieldMapper {
             QueryShardContext context
         ) {
             String searchTerm = BytesRefs.toString(value);
-            String lowerSearchTerm = toLowerCase(searchTerm);
             try {
                 BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
                 //The approximation query can have a prefix and any number of ngrams.
                 BooleanQuery.Builder approxBuilder = new BooleanQuery.Builder();
 
-                String postPrefixString = lowerSearchTerm;
+                String postPrefixString = searchTerm;
 
                 // Add all content prior to prefixLength as a MUST clause to the ngram index query
                 if (prefixLength > 0) {
                     Set<String> prefixTokens = new LinkedHashSet<>();
-                    postPrefixString = lowerSearchTerm.substring(prefixLength);
-                    String prefixCandidate = TOKEN_START_OR_END_CHAR + lowerSearchTerm.substring(0, prefixLength);
+                    postPrefixString = searchTerm.substring(prefixLength);
+                    String prefixCandidate = TOKEN_START_OR_END_CHAR + searchTerm.substring(0, prefixLength);
                     getNgramTokens(prefixTokens, prefixCandidate);
                     for (String prefixToken : prefixTokens) {
                         addClause(prefixToken, approxBuilder, Occur.MUST);
                     }
                 }
                 // Tokenize all content after the prefix
-                TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), postPrefixString);
+                TokenStream tokenizer = indexAnalyzer().tokenStream(name(), postPrefixString);
                 CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
                 ArrayList<String> postPrefixTokens = new ArrayList<>();
                 String firstToken = null;
@@ -985,10 +1059,7 @@ public class WildcardFieldMapper extends FieldMapper {
         if (value == null || value.length() > ignoreAbove) {
             return;
         }
-        // Always lower case the ngram index and value - helps with
-        // a) speed (less ngram variations to explore on disk and in RAM-based automaton) and
-        // b) uses less disk space
-        String ngramValue = addLineEndChars(WildcardFieldType.toLowerCase(value));
+        String ngramValue = addLineEndChars(value);
         Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType);
         fields.add(ngramField);
WildcardFieldMapperTests.java

@@ -77,18 +77,21 @@ import static org.mockito.Mockito.when;
 
 public class WildcardFieldMapperTests extends ESTestCase {
 
-    static QueryShardContext createMockQueryShardContext(boolean allowExpensiveQueries) {
+    static QueryShardContext createMockQueryShardContext(boolean allowExpensiveQueries, Version version) {
         QueryShardContext queryShardContext = mock(QueryShardContext.class);
         when(queryShardContext.allowExpensiveQueries()).thenReturn(allowExpensiveQueries);
+        when(queryShardContext.indexVersionCreated()).thenReturn(version);
         return queryShardContext;
     }
 
     private static final String KEYWORD_FIELD_NAME = "keyword_field";
     private static final String WILDCARD_FIELD_NAME = "wildcard_field";
-    public static final QueryShardContext MOCK_QSC = createMockQueryShardContext(true);
+    public static final QueryShardContext MOCK_QSC = createMockQueryShardContext(true, Version.CURRENT);
+    public static final QueryShardContext MOCK_7_9_QSC = createMockQueryShardContext(true, Version.V_7_9_0);
 
     static final int MAX_FIELD_LENGTH = 30;
     static WildcardFieldMapper wildcardFieldType;
+    static WildcardFieldMapper wildcardFieldType79;
     static KeywordFieldMapper keywordFieldType;
 
     @Override
@@ -96,11 +99,17 @@ public class WildcardFieldMapperTests extends ESTestCase {
     public void setUp() throws Exception {
         Builder builder = new WildcardFieldMapper.Builder(WILDCARD_FIELD_NAME);
         builder.ignoreAbove(MAX_FIELD_LENGTH);
-        wildcardFieldType = builder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0)));
+        wildcardFieldType = builder.build(
+            new Mapper.BuilderContext(createIndexSettings(Version.CURRENT).getSettings(), new ContentPath(0))
+        );
+        wildcardFieldType79 = builder.build(
+            new Mapper.BuilderContext(createIndexSettings(Version.V_7_9_0).getSettings(), new ContentPath(0))
+        );
 
         org.elasticsearch.index.mapper.KeywordFieldMapper.Builder kwBuilder = new KeywordFieldMapper.Builder(KEYWORD_FIELD_NAME);
-        keywordFieldType = kwBuilder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0)));
+        keywordFieldType = kwBuilder.build(
+            new Mapper.BuilderContext(createIndexSettings(Version.CURRENT).getSettings(), new ContentPath(0))
+        );
         super.setUp();
     }
@@ -120,7 +129,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
 
     public void testTooBigKeywordField() throws IOException {
         Directory dir = newDirectory();
-        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
+        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10);
         iwc.setMergePolicy(newTieredMergePolicy(random()));
         RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@@ -143,11 +152,44 @@ public class WildcardFieldMapperTests extends ESTestCase {
         reader.close();
         dir.close();
     }
 
+    public void testBWCIndexVersion() throws IOException {
+        // Create old format index using the wildcard ngram analyzer shipped in the 7.9 launch
+        Directory dir = newDirectory();
+        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_9);
+        iwc.setMergePolicy(newTieredMergePolicy(random()));
+        RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+        Document doc = new Document();
+        ParseContext.Document parseDoc = new ParseContext.Document();
+        addFields(parseDoc, doc, "a b");
+        indexDoc(parseDoc, doc, iw);
+
+        iw.forceMerge(1);
+        DirectoryReader reader = iw.getReader();
+        IndexSearcher searcher = newSearcher(reader);
+        iw.close();
+
+        // Unnatural circumstance - test that we fail if the new analyzer is used on an old index
+        Query oldWildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery("a b", null, null);
+        TopDocs oldWildcardFieldTopDocs = searcher.search(oldWildcardFieldQuery, 10, Sort.INDEXORDER);
+        assertThat(oldWildcardFieldTopDocs.totalHits.value, equalTo(0L));
+
+        // Natural circumstance - test that we revert to the old analyzer for old indices
+        Query wildcardFieldQuery = wildcardFieldType79.fieldType().wildcardQuery("a b", null, null);
+        TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
+        assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(1L));
+
+        reader.close();
+        dir.close();
+    }
+
     //Test long query strings don't cause exceptions
     public void testTooBigQueryField() throws IOException {
         Directory dir = newDirectory();
-        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
+        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10);
         iwc.setMergePolicy(newTieredMergePolicy(random()));
         RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@@ -182,7 +224,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
 
     public void testTermAndPrefixQueryIgnoreWildcardSyntax() throws IOException {
         Directory dir = newDirectory();
-        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
+        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10);
         iwc.setMergePolicy(newTieredMergePolicy(random()));
         RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@@ -223,7 +265,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
 
     public void testSearchResultsVersusKeywordField() throws IOException {
         Directory dir = newDirectory();
-        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
+        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10);
         iwc.setMergePolicy(newTieredMergePolicy(random()));
         RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@@ -361,7 +403,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
 
     public void testRangeQueryVersusKeywordField() throws IOException {
         Directory dir = newDirectory();
-        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
+        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10);
         iwc.setMergePolicy(newTieredMergePolicy(random()));
         RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@@ -374,6 +416,10 @@ public class WildcardFieldMapperTests extends ESTestCase {
         indexDoc(iw, "a.txt");
         indexDoc(iw, "n.txt");
         indexDoc(iw, "z.txt");
+        indexDoc(iw, "A.txt");
+        indexDoc(iw, "N.txt");
+        indexDoc(iw, "^.txt");
+        indexDoc(iw, "Z.txt");
 
         iw.forceMerge(1);
         DirectoryReader reader = iw.getReader();
@@ -390,7 +436,8 @@ public class WildcardFieldMapperTests extends ESTestCase {
             {"a.txt", "z.txt"},
             {"a.txt", "n.txt"},
             {null, "z.txt"},
-            {"a.txt", null}
+            {"a.txt", null},
+            {"A.txt", "z.txt"}
         };
 
         for (String[] bounds : rangeTests) {
@@ -436,15 +483,16 @@ public class WildcardFieldMapperTests extends ESTestCase {
 
         // All of these regexes should be accelerated as the equivalent of the given QueryString query
         String acceleratedTests[][] = {
-            {".*foo.*", "foo"},
-            {"..foobar","+foo +oba +ar_ +r__"},
-            {"(maynotexist)?foobar","+foo +oba +ar_ +r__"},
-            {".*/etc/passw.*", "+\\/et +tc\\/ +\\/pa +ass +ssw"},
-            {".*etc/passwd", "+etc +c\\/p +pas +ssw +wd_ +d__"},
-            {"(http|ftp)://foo.*", "+((+htt +ttp) ftp) +(+\\:\\/\\/ +\\/fo +foo)"},
-            {"[Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll]\\.[Ee][Xx][Ee]", "+_po +owe +ers +she +ell +l\\.e +exe +e__"},
-            {"foo<1-100>bar", "+(+_fo +foo) +(+bar +r__ )"},
-            {"(aaa.+&.+bbb)cat", "+cat +t__"},
+            {".*foo.*", "eoo"},
+            {"..foobar","+eoo +ooa +oaa +aaq +aq_ +q__"},
+            {"(maynotexist)?foobar","+eoo +ooa +oaa +aaq +aq_ +q__"},
+            {".*/etc/passw.*", "+\\/es +esc +sc\\/ +c\\/o +\\/oa +oas +ass +ssw"},
+            {".*etc/passwd", " +esc +sc\\/ +c\\/o +\\/oa +oas +ass +ssw +swc +wc_ +c__"},
+            {"(http|ftp)://foo.*", "+((+gss +sso) eso) +(+\\/\\/\\/ +\\/\\/e +\\/eo +eoo)"},
+            {"[Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll]\\.[Ee][Xx][Ee]",
+                "+_oo +oow +owe +weq +eqs +qsg +sge +gek +ekk +kk\\/ +k\\/e +\\/ew +ewe +we_ +e__"},
+            {"foo<1-100>bar", "+(+_eo +eoo) +(+aaq +aq_ +q__)"},
+            {"(aaa.+&.+bbb)cat", "+cas +as_ +s__"},
             {".a", "a__"}
         };
         for (String[] test : acceleratedTests) {
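The new expected clauses are simply the 3-grams of the folded search string. A hand-worked example (assumed, not part of the test itself):

    // "foobar" folds to "eooaaq" ('f'=102->'e', 'b'=98->'a', 'r'=114->'q'; 'o' and 'a' are odd and unchanged),
    // so with the trailing TOKEN_START_OR_END_CHAR its 3-grams are eoo, ooa, oaa, aaq, aq_ and q__.
    String folded = WildcardFieldMapper.PunctuationFoldingFilter.normalize("foobar"); // "eooaaq"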
@@ -469,7 +517,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
         String suboptimalTests[][] = {
             // TODO short wildcards like a* OR b* aren't great so we just drop them.
             // Ideally we would attach to successors to create (acd OR bcd)
-            { "[ab]cd", "+cd_ +d__"}
+            { "[ab]cd", "+cc_ +c__"}
         };
         for (String[] test : suboptimalTests) {
             String regex = test[0];
@@ -499,13 +547,13 @@ public class WildcardFieldMapperTests extends ESTestCase {
 
         // All of these patterns should be accelerated.
         String tests[][] = {
-            { "*foobar", "+foo +oba +ar_ +r__" },
-            { "foobar*", "+_fo +oob +bar" },
-            { "foo\\*bar*", "+_fo +oo\\* +\\*ba +bar" },
-            { "foo\\?bar*", "+_fo +oo\\? +\\?ba +bar" },
-            { "foo*bar", "+_fo +foo +bar +r__" },
-            { "foo?bar", "+_fo +foo +bar +r__" },
-            { "?foo*bar?", "+foo +bar" },
+            { "*foobar", "+eoo +ooa +oaa +aaq +aq_ +q__" },
+            { "foobar*", "+_eo +eoo +ooa +oaa +aaq" },
+            { "foo\\*bar*", "+_eo +eoo +oo\\/ +o\\/a +\\/aa +aaq" },
+            { "foo\\?bar*", "+_eo +eoo +oo\\/ +o\\/a +\\/aa +aaq" },
+            { "foo*bar", "+_eo +eoo +aaq +aq_ +q__" },
+            { "foo?bar", "+_eo +eoo +aaq +aq_ +q__" },
+            { "?foo*bar?", "+eoo +aaq" },
             { "*c", "+c__" } };
         for (String[] test : tests) {
             String pattern = test[0];
@@ -601,10 +649,10 @@ public class WildcardFieldMapperTests extends ESTestCase {
     public void testFuzzyAcceleration() throws IOException, ParseException {
 
         FuzzyTest[] tests = {
-            new FuzzyTest("123456", 0, Fuzziness.ONE, null, 1, "123 456"),
-            new FuzzyTest("1234567890", 2, Fuzziness.ONE, "_12", 1, "345 678"),
-            new FuzzyTest("12345678901", 2, Fuzziness.ONE, "_12", 2, "345 678 901"),
-            new FuzzyTest("12345678", 4, Fuzziness.ONE, "_12 234", 0, null)
+            new FuzzyTest("123456", 0, Fuzziness.ONE, null, 1, "113 355"),
+            new FuzzyTest("1234567890", 2, Fuzziness.ONE, "_11", 1, "335 577"),
+            new FuzzyTest("12345678901", 2, Fuzziness.ONE, "_11", 2, "335 577 901"),
+            new FuzzyTest("12345678", 4, Fuzziness.ONE, "_11 113 133", 0, null)
         };
         for (FuzzyTest test : tests) {
             Query wildcardFieldQuery = test.getFuzzyQuery();
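Digits fold the same way as letters, which is what changes the fuzzy-acceleration expectations. A hand-worked check (assumed, not part of the test itself):

    // "123456" folds to "113355" ('2'=50->'1', '4'=52->'3', '6'=54->'5'),
    // so the non-overlapping 3-grams used for acceleration become 113 and 355 instead of 123 and 456.
    String foldedDigits = WildcardFieldMapper.PunctuationFoldingFilter.normalize("123456"); // "113355"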
@@ -650,8 +698,8 @@ public class WildcardFieldMapperTests extends ESTestCase {
     public void testRangeAcceleration() throws IOException, ParseException {
 
         RangeTest[] tests = {
-            new RangeTest("c:/a.txt", "c:/z.txt", "_c: c:/"),
-            new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z.txt", "_c: :/p pro ogr ram mfi ile es/"),
+            new RangeTest("c:/a.txt", "c:/z.txt", "_c/ c//"),
+            new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z/txt", "_c/ c// //o /oq oqo qog ogq gqa qam ame mei eik ike kes es/"),
         };
         for (RangeTest test : tests) {
             Query wildcardFieldQuery = test.getRangeQuery();
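Punctuation folding likewise explains the new range-acceleration expectations. A hand-worked check (assumed, not part of the test itself):

    // "c:/" folds to "c//" (':' is punctuation and becomes '/'), so the common-prefix ngrams for
    // "c:/a.txt" to "c:/z.txt" are now "_c/" and "c//" rather than "_c:" and "c:/".
    String foldedPath = WildcardFieldMapper.PunctuationFoldingFilter.normalize("c:/"); // "c//"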
@@ -893,9 +941,9 @@ public class WildcardFieldMapperTests extends ESTestCase {
         iw.addDocument(doc);
     }
 
-    protected IndexSettings createIndexSettings() {
+    protected IndexSettings createIndexSettings(Version version) {
         return new IndexSettings(
-            IndexMetadata.builder("_index").settings(Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT))
+            IndexMetadata.builder("_index").settings(Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, version))
                 .numberOfShards(1).numberOfReplicas(0).creationDate(System.currentTimeMillis()).build(),
             Settings.EMPTY);
     }