Disable graph analysis at query time for shingle and cjk filters producing tokens of different size (#23920)
This change disables graph analysis of token streams containing a shingle or a cjk filters that produce shingle or ngram of different size. The graph analysis is disabled for phrase and boolean queries. Closes #23918
This commit is contained in:
parent
203f8433c2
commit
38009efedd
|
@ -0,0 +1,32 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.util.Attribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This attribute can be used to indicate that the {@link PositionLengthAttribute}
|
||||||
|
* should not be taken in account in this {@link TokenStream}.
|
||||||
|
* Query parsers can extract this information to decide if this token stream should be analyzed
|
||||||
|
* as a graph or not.
|
||||||
|
*/
|
||||||
|
public interface DisableGraphAttribute extends Attribute {}
|
|
@ -0,0 +1,38 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.AttributeImpl;
|
||||||
|
import org.apache.lucene.util.AttributeReflector;
|
||||||
|
|
||||||
|
/** Default implementation of {@link DisableGraphAttribute}. */
|
||||||
|
public class DisableGraphAttributeImpl extends AttributeImpl implements DisableGraphAttribute {
|
||||||
|
public DisableGraphAttributeImpl() {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void clear() {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reflectWith(AttributeReflector reflector) {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void copyTo(AttributeImpl target) {}
|
||||||
|
}
|
|
@ -20,6 +20,7 @@
|
||||||
package org.apache.lucene.queryparser.classic;
|
package org.apache.lucene.queryparser.classic;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
@ -49,6 +50,7 @@ import org.elasticsearch.index.mapper.MapperService;
|
||||||
import org.elasticsearch.index.mapper.StringFieldType;
|
import org.elasticsearch.index.mapper.StringFieldType;
|
||||||
import org.elasticsearch.index.query.QueryShardContext;
|
import org.elasticsearch.index.query.QueryShardContext;
|
||||||
import org.elasticsearch.index.query.support.QueryParsers;
|
import org.elasticsearch.index.query.support.QueryParsers;
|
||||||
|
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -56,7 +58,6 @@ import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
|
||||||
|
|
||||||
import static java.util.Collections.unmodifiableMap;
|
import static java.util.Collections.unmodifiableMap;
|
||||||
import static org.elasticsearch.common.lucene.search.Queries.fixNegativeQueryIfNeeded;
|
import static org.elasticsearch.common.lucene.search.Queries.fixNegativeQueryIfNeeded;
|
||||||
|
@ -805,4 +806,30 @@ public class MapperQueryParser extends AnalyzingQueryParser {
|
||||||
}
|
}
|
||||||
return super.parse(query);
|
return super.parse(query);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if graph analysis should be enabled for the field depending
|
||||||
|
* on the provided {@link Analyzer}
|
||||||
|
*/
|
||||||
|
protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
|
||||||
|
String queryText, boolean quoted, int phraseSlop) {
|
||||||
|
assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
|
||||||
|
|
||||||
|
// Use the analyzer to get all the tokens, and then build an appropriate
|
||||||
|
// query based on the analysis chain.
|
||||||
|
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
|
||||||
|
if (source.hasAttribute(DisableGraphAttribute.class)) {
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
|
||||||
|
* paths explosion. See {@link ShingleTokenFilterFactory} for details.
|
||||||
|
*/
|
||||||
|
setEnableGraphQueries(false);
|
||||||
|
}
|
||||||
|
Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
|
||||||
|
setEnableGraphQueries(true);
|
||||||
|
return query;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("Error analyzing query text", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
|
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
import org.elasticsearch.index.IndexSettings;
|
import org.elasticsearch.index.IndexSettings;
|
||||||
|
@ -74,7 +75,17 @@ public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
return new CJKBigramFilter(tokenStream, flags, outputUnigrams);
|
CJKBigramFilter filter = new CJKBigramFilter(tokenStream, flags, outputUnigrams);
|
||||||
|
if (outputUnigrams) {
|
||||||
|
/**
|
||||||
|
* We disable the graph analysis on this token stream
|
||||||
|
* because it produces bigrams AND unigrams.
|
||||||
|
* Graph analysis on such token stream is useless and dangerous as it may create too many paths
|
||||||
|
* since shingles of different size are not aligned in terms of positions.
|
||||||
|
*/
|
||||||
|
filter.addAttribute(DisableGraphAttribute.class);
|
||||||
|
}
|
||||||
|
return filter;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
package org.elasticsearch.index.analysis;
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
|
||||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
|
@ -86,6 +87,15 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||||
filter.setTokenSeparator(tokenSeparator);
|
filter.setTokenSeparator(tokenSeparator);
|
||||||
filter.setFillerToken(fillerToken);
|
filter.setFillerToken(fillerToken);
|
||||||
|
if (outputUnigrams || (minShingleSize != maxShingleSize)) {
|
||||||
|
/**
|
||||||
|
* We disable the graph analysis on this token stream
|
||||||
|
* because it produces shingles of different size.
|
||||||
|
* Graph analysis on such token stream is useless and dangerous as it may create too many paths
|
||||||
|
* since shingles of different size are not aligned in terms of positions.
|
||||||
|
*/
|
||||||
|
filter.addAttribute(DisableGraphAttribute.class);
|
||||||
|
}
|
||||||
return filter;
|
return filter;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
package org.elasticsearch.index.query;
|
package org.elasticsearch.index.query;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
@ -31,6 +32,7 @@ import org.apache.lucene.search.PrefixQuery;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.SynonymQuery;
|
import org.apache.lucene.search.SynonymQuery;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
|
||||||
import org.elasticsearch.index.mapper.MappedFieldType;
|
import org.elasticsearch.index.mapper.MappedFieldType;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -167,6 +169,32 @@ public class SimpleQueryParser extends org.apache.lucene.queryparser.simple.Simp
|
||||||
return super.simplify(bq.build());
|
return super.simplify(bq.build());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if graph analysis should be enabled for the field depending
|
||||||
|
* on the provided {@link Analyzer}
|
||||||
|
*/
|
||||||
|
protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
|
||||||
|
String queryText, boolean quoted, int phraseSlop) {
|
||||||
|
assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
|
||||||
|
|
||||||
|
// Use the analyzer to get all the tokens, and then build an appropriate
|
||||||
|
// query based on the analysis chain.
|
||||||
|
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
|
||||||
|
if (source.hasAttribute(DisableGraphAttribute.class)) {
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
|
||||||
|
* paths explosion. See {@link ShingleTokenFilterFactory} for details.
|
||||||
|
*/
|
||||||
|
setEnableGraphQueries(false);
|
||||||
|
}
|
||||||
|
Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
|
||||||
|
setEnableGraphQueries(true);
|
||||||
|
return query;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("Error analyzing query text", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static Query wrapWithBoost(Query query, float boost) {
|
private static Query wrapWithBoost(Query query, float boost) {
|
||||||
if (boost != AbstractQueryBuilder.DEFAULT_BOOST) {
|
if (boost != AbstractQueryBuilder.DEFAULT_BOOST) {
|
||||||
return new BoostQuery(query, boost);
|
return new BoostQuery(query, boost);
|
||||||
|
|
|
@ -20,6 +20,8 @@
|
||||||
package org.elasticsearch.index.search;
|
package org.elasticsearch.index.search;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
|
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
|
||||||
import org.apache.lucene.search.BooleanClause;
|
import org.apache.lucene.search.BooleanClause;
|
||||||
|
@ -49,6 +51,7 @@ import org.elasticsearch.common.lucene.all.AllTermQuery;
|
||||||
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
|
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
|
||||||
import org.elasticsearch.common.lucene.search.Queries;
|
import org.elasticsearch.common.lucene.search.Queries;
|
||||||
import org.elasticsearch.common.unit.Fuzziness;
|
import org.elasticsearch.common.unit.Fuzziness;
|
||||||
|
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
|
||||||
import org.elasticsearch.index.mapper.MappedFieldType;
|
import org.elasticsearch.index.mapper.MappedFieldType;
|
||||||
import org.elasticsearch.index.query.QueryShardContext;
|
import org.elasticsearch.index.query.QueryShardContext;
|
||||||
import org.elasticsearch.index.query.support.QueryParsers;
|
import org.elasticsearch.index.query.support.QueryParsers;
|
||||||
|
@ -320,6 +323,32 @@ public class MatchQuery {
|
||||||
return blendTermsQuery(terms, mapper);
|
return blendTermsQuery(terms, mapper);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if graph analysis should be enabled for the field depending
|
||||||
|
* on the provided {@link Analyzer}
|
||||||
|
*/
|
||||||
|
protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
|
||||||
|
String queryText, boolean quoted, int phraseSlop) {
|
||||||
|
assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
|
||||||
|
|
||||||
|
// Use the analyzer to get all the tokens, and then build an appropriate
|
||||||
|
// query based on the analysis chain.
|
||||||
|
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
|
||||||
|
if (source.hasAttribute(DisableGraphAttribute.class)) {
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
|
||||||
|
* paths explosion. See {@link ShingleTokenFilterFactory} for details.
|
||||||
|
*/
|
||||||
|
setEnableGraphQueries(false);
|
||||||
|
}
|
||||||
|
Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
|
||||||
|
setEnableGraphQueries(true);
|
||||||
|
return query;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("Error analyzing query text", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public Query createPhrasePrefixQuery(String field, String queryText, int phraseSlop, int maxExpansions) {
|
public Query createPhrasePrefixQuery(String field, String queryText, int phraseSlop, int maxExpansions) {
|
||||||
final Query query = createFieldQuery(getAnalyzer(), Occur.MUST, field, queryText, true, phraseSlop);
|
final Query query = createFieldQuery(getAnalyzer(), Occur.MUST, field, queryText, true, phraseSlop);
|
||||||
return toMultiPhrasePrefix(query, phraseSlop, maxExpansions);
|
return toMultiPhrasePrefix(query, phraseSlop, maxExpansions);
|
||||||
|
|
|
@ -19,7 +19,9 @@
|
||||||
|
|
||||||
package org.elasticsearch.index.analysis;
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
import org.elasticsearch.test.ESTokenStreamTestCase;
|
import org.elasticsearch.test.ESTokenStreamTestCase;
|
||||||
|
@ -69,4 +71,25 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
|
||||||
tokenizer.setReader(new StringReader(source));
|
tokenizer.setReader(new StringReader(source));
|
||||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testDisableGraph() throws IOException {
|
||||||
|
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
|
||||||
|
TokenFilterFactory allFlagsFactory = analysis.tokenFilter.get("cjk_all_flags");
|
||||||
|
TokenFilterFactory hanOnlyFactory = analysis.tokenFilter.get("cjk_han_only");
|
||||||
|
|
||||||
|
String source = "多くの学生が試験に落ちた。";
|
||||||
|
Tokenizer tokenizer = new StandardTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader(source));
|
||||||
|
try (TokenStream tokenStream = allFlagsFactory.create(tokenizer)) {
|
||||||
|
// This config outputs different size of ngrams so graph analysis is disabled
|
||||||
|
assertTrue(tokenStream.hasAttribute(DisableGraphAttribute.class));
|
||||||
|
}
|
||||||
|
|
||||||
|
tokenizer = new StandardTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader(source));
|
||||||
|
try (TokenStream tokenStream = hanOnlyFactory.create(tokenizer)) {
|
||||||
|
// This config uses only bigrams so graph analysis is enabled
|
||||||
|
assertFalse(tokenStream.hasAttribute(DisableGraphAttribute.class));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.analysis.StopFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
import org.elasticsearch.test.ESTokenStreamTestCase;
|
import org.elasticsearch.test.ESTokenStreamTestCase;
|
||||||
|
|
||||||
|
@ -80,4 +81,25 @@ public class ShingleTokenFilterFactoryTests extends ESTokenStreamTestCase {
|
||||||
TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
|
TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
|
||||||
assertTokenStreamContents(tokenFilter.create(stream), expected);
|
assertTokenStreamContents(tokenFilter.create(stream), expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testDisableGraph() throws IOException {
|
||||||
|
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
|
||||||
|
TokenFilterFactory shingleFiller = analysis.tokenFilter.get("shingle_filler");
|
||||||
|
TokenFilterFactory shingleInverse = analysis.tokenFilter.get("shingle_inverse");
|
||||||
|
|
||||||
|
String source = "hello world";
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader(source));
|
||||||
|
try (TokenStream stream = shingleFiller.create(tokenizer)) {
|
||||||
|
// This config uses different size of shingles so graph analysis is disabled
|
||||||
|
assertTrue(stream.hasAttribute(DisableGraphAttribute.class));
|
||||||
|
}
|
||||||
|
|
||||||
|
tokenizer = new WhitespaceTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader(source));
|
||||||
|
try (TokenStream stream = shingleInverse.create(tokenizer)) {
|
||||||
|
// This config uses a single size of shingles so graph analysis is enabled
|
||||||
|
assertFalse(stream.hasAttribute(DisableGraphAttribute.class));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,251 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.query;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.SynonymQuery;
|
||||||
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
|
import org.apache.lucene.search.BooleanClause;
|
||||||
|
import org.apache.lucene.search.TermQuery;
|
||||||
|
import org.apache.lucene.search.PhraseQuery;
|
||||||
|
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||||
|
import org.apache.lucene.search.MultiPhraseQuery;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.index.IndexService;
|
||||||
|
import org.elasticsearch.index.search.MatchQuery;
|
||||||
|
import org.elasticsearch.test.ESSingleNodeTestCase;
|
||||||
|
import org.junit.After;
|
||||||
|
import org.junit.Before;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
import static org.hamcrest.Matchers.instanceOf;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Makes sure that graph analysis is disabled with shingle filters of different size
|
||||||
|
*/
|
||||||
|
public class DisableGraphQueryTests extends ESSingleNodeTestCase {
|
||||||
|
private static IndexService indexService;
|
||||||
|
private static QueryShardContext shardContext;
|
||||||
|
private static Query expectedQuery;
|
||||||
|
private static Query expectedPhraseQuery;
|
||||||
|
private static Query expectedQueryWithUnigram;
|
||||||
|
private static Query expectedPhraseQueryWithUnigram;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setup() {
|
||||||
|
Settings settings = Settings.builder()
|
||||||
|
.put("index.analysis.filter.shingle.type", "shingle")
|
||||||
|
.put("index.analysis.filter.shingle.output_unigrams", false)
|
||||||
|
.put("index.analysis.filter.shingle.min_size", 2)
|
||||||
|
.put("index.analysis.filter.shingle.max_size", 2)
|
||||||
|
.put("index.analysis.filter.shingle_unigram.type", "shingle")
|
||||||
|
.put("index.analysis.filter.shingle_unigram.output_unigrams", true)
|
||||||
|
.put("index.analysis.filter.shingle_unigram.min_size", 2)
|
||||||
|
.put("index.analysis.filter.shingle_unigram.max_size", 2)
|
||||||
|
.put("index.analysis.analyzer.text_shingle.tokenizer", "whitespace")
|
||||||
|
.put("index.analysis.analyzer.text_shingle.filter", "lowercase, shingle")
|
||||||
|
.put("index.analysis.analyzer.text_shingle_unigram.tokenizer", "whitespace")
|
||||||
|
.put("index.analysis.analyzer.text_shingle_unigram.filter",
|
||||||
|
"lowercase, shingle_unigram")
|
||||||
|
.build();
|
||||||
|
indexService = createIndex("test", settings, "t",
|
||||||
|
"text_shingle", "type=text,analyzer=text_shingle",
|
||||||
|
"text_shingle_unigram", "type=text,analyzer=text_shingle_unigram");
|
||||||
|
shardContext = indexService.newQueryShardContext(0, null, () -> 0L);
|
||||||
|
|
||||||
|
// parsed queries for "text_shingle_unigram:(foo bar baz)" with query parsers
|
||||||
|
// that ignores position length attribute
|
||||||
|
expectedQueryWithUnigram= new BooleanQuery.Builder()
|
||||||
|
.add(
|
||||||
|
new SynonymQuery(
|
||||||
|
new Term("text_shingle_unigram", "foo"),
|
||||||
|
new Term("text_shingle_unigram", "foo bar")
|
||||||
|
), BooleanClause.Occur.SHOULD)
|
||||||
|
.add(
|
||||||
|
new SynonymQuery(
|
||||||
|
new Term("text_shingle_unigram", "bar"),
|
||||||
|
new Term("text_shingle_unigram", "bar baz")
|
||||||
|
), BooleanClause.Occur.SHOULD)
|
||||||
|
.add(
|
||||||
|
new TermQuery(
|
||||||
|
new Term("text_shingle_unigram", "baz")
|
||||||
|
), BooleanClause.Occur.SHOULD)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
// parsed query for "text_shingle_unigram:\"foo bar baz\" with query parsers
|
||||||
|
// that ignores position length attribute
|
||||||
|
expectedPhraseQueryWithUnigram = new MultiPhraseQuery.Builder()
|
||||||
|
.add(
|
||||||
|
new Term[] {
|
||||||
|
new Term("text_shingle_unigram", "foo"),
|
||||||
|
new Term("text_shingle_unigram", "foo bar")
|
||||||
|
}, 0)
|
||||||
|
.add(
|
||||||
|
new Term[] {
|
||||||
|
new Term("text_shingle_unigram", "bar"),
|
||||||
|
new Term("text_shingle_unigram", "bar baz")
|
||||||
|
}, 1)
|
||||||
|
.add(
|
||||||
|
new Term[] {
|
||||||
|
new Term("text_shingle_unigram", "baz"),
|
||||||
|
}, 2)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
// parsed query for "text_shingle:(foo bar baz)
|
||||||
|
expectedQuery = new BooleanQuery.Builder()
|
||||||
|
.add(
|
||||||
|
new TermQuery(new Term("text_shingle", "foo bar")),
|
||||||
|
BooleanClause.Occur.SHOULD
|
||||||
|
)
|
||||||
|
.add(
|
||||||
|
new TermQuery(new Term("text_shingle","bar baz")),
|
||||||
|
BooleanClause.Occur.SHOULD
|
||||||
|
)
|
||||||
|
.add(
|
||||||
|
new TermQuery(new Term("text_shingle","baz biz")),
|
||||||
|
BooleanClause.Occur.SHOULD
|
||||||
|
)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
// parsed query for "text_shingle:"foo bar baz"
|
||||||
|
expectedPhraseQuery = new PhraseQuery.Builder()
|
||||||
|
.add(
|
||||||
|
new Term("text_shingle", "foo bar")
|
||||||
|
)
|
||||||
|
.add(
|
||||||
|
new Term("text_shingle","bar baz")
|
||||||
|
)
|
||||||
|
.add(
|
||||||
|
new Term("text_shingle","baz biz")
|
||||||
|
)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
@After
|
||||||
|
public void cleanup() {
|
||||||
|
indexService = null;
|
||||||
|
shardContext = null;
|
||||||
|
expectedQuery = null;
|
||||||
|
expectedPhraseQuery = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMatchPhraseQuery() throws IOException {
|
||||||
|
MatchPhraseQueryBuilder builder =
|
||||||
|
new MatchPhraseQueryBuilder("text_shingle_unigram", "foo bar baz");
|
||||||
|
Query query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
|
||||||
|
|
||||||
|
builder =
|
||||||
|
new MatchPhraseQueryBuilder("text_shingle", "foo bar baz biz");
|
||||||
|
query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedPhraseQuery, equalTo(query));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMatchQuery() throws IOException {
|
||||||
|
MatchQueryBuilder builder =
|
||||||
|
new MatchQueryBuilder("text_shingle_unigram", "foo bar baz");
|
||||||
|
Query query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedQueryWithUnigram, equalTo(query));
|
||||||
|
|
||||||
|
builder = new MatchQueryBuilder("text_shingle", "foo bar baz biz");
|
||||||
|
query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedQuery, equalTo(query));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMultiMatchQuery() throws IOException {
|
||||||
|
MultiMatchQueryBuilder builder = new MultiMatchQueryBuilder("foo bar baz",
|
||||||
|
"text_shingle_unigram");
|
||||||
|
Query query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedQueryWithUnigram, equalTo(query));
|
||||||
|
|
||||||
|
builder.type(MatchQuery.Type.PHRASE);
|
||||||
|
query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
|
||||||
|
|
||||||
|
builder = new MultiMatchQueryBuilder("foo bar baz biz", "text_shingle");
|
||||||
|
query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedQuery, equalTo(query));
|
||||||
|
|
||||||
|
builder.type(MatchQuery.Type.PHRASE);
|
||||||
|
query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedPhraseQuery, equalTo(query));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSimpleQueryString() throws IOException {
|
||||||
|
SimpleQueryStringBuilder builder = new SimpleQueryStringBuilder("foo bar baz");
|
||||||
|
builder.field("text_shingle_unigram");
|
||||||
|
builder.flags(SimpleQueryStringFlag.NONE);
|
||||||
|
Query query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedQueryWithUnigram, equalTo(query));
|
||||||
|
|
||||||
|
builder = new SimpleQueryStringBuilder("\"foo bar baz\"");
|
||||||
|
builder.field("text_shingle_unigram");
|
||||||
|
builder.flags(SimpleQueryStringFlag.PHRASE);
|
||||||
|
query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
|
||||||
|
|
||||||
|
builder = new SimpleQueryStringBuilder("foo bar baz biz");
|
||||||
|
builder.field("text_shingle");
|
||||||
|
builder.flags(SimpleQueryStringFlag.NONE);
|
||||||
|
query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedQuery, equalTo(query));
|
||||||
|
|
||||||
|
builder = new SimpleQueryStringBuilder("\"foo bar baz biz\"");
|
||||||
|
builder.field("text_shingle");
|
||||||
|
builder.flags(SimpleQueryStringFlag.PHRASE);
|
||||||
|
query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedPhraseQuery, equalTo(query));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testQueryString() throws IOException {
|
||||||
|
QueryStringQueryBuilder builder = new QueryStringQueryBuilder("foo bar baz");
|
||||||
|
builder.field("text_shingle_unigram");
|
||||||
|
builder.splitOnWhitespace(false);
|
||||||
|
Query query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedQueryWithUnigram, equalTo(query));
|
||||||
|
|
||||||
|
builder = new QueryStringQueryBuilder("\"foo bar baz\"");
|
||||||
|
builder.field("text_shingle_unigram");
|
||||||
|
builder.splitOnWhitespace(false);
|
||||||
|
query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(query, instanceOf(DisjunctionMaxQuery.class));
|
||||||
|
DisjunctionMaxQuery maxQuery = (DisjunctionMaxQuery) query;
|
||||||
|
assertThat(maxQuery.getDisjuncts().size(), equalTo(1));
|
||||||
|
assertThat(expectedPhraseQueryWithUnigram, equalTo(maxQuery.getDisjuncts().get(0)));
|
||||||
|
|
||||||
|
builder = new QueryStringQueryBuilder("foo bar baz biz");
|
||||||
|
builder.field("text_shingle");
|
||||||
|
builder.splitOnWhitespace(false);
|
||||||
|
query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(expectedQuery, equalTo(query));
|
||||||
|
|
||||||
|
builder = new QueryStringQueryBuilder("\"foo bar baz biz\"");
|
||||||
|
builder.field("text_shingle");
|
||||||
|
builder.splitOnWhitespace(false);
|
||||||
|
query = builder.doToQuery(shardContext);
|
||||||
|
assertThat(query, instanceOf(DisjunctionMaxQuery.class));
|
||||||
|
maxQuery = (DisjunctionMaxQuery) query;
|
||||||
|
assertThat(maxQuery.getDisjuncts().size(), equalTo(1));
|
||||||
|
assertThat(expectedPhraseQuery, equalTo(maxQuery.getDisjuncts().get(0)));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue