SOLR-11662: synonymQueryStyle option for FieldType used by query parser

This commit is contained in:
David Smiley 2017-12-04 13:25:16 -05:00
parent 929ce7ca30
commit 83753d0a2a
12 changed files with 222 additions and 16 deletions

View File

@ -62,7 +62,7 @@ public class QueryBuilder {
protected boolean enablePositionIncrements = true;
protected boolean enableGraphQueries = true;
protected boolean autoGenerateMultiTermSynonymsPhraseQuery = false;
/** Creates a new QueryBuilder using the given analyzer. */
public QueryBuilder(Analyzer analyzer) {
this.analyzer = analyzer;

View File

@ -102,6 +102,9 @@ New Features
* SOLR-11250: A new DefaultWrapperModel class for loading of large and/or externally stored
LTRScoringModel definitions. (Yuki Yano, shalin, Christine Poerschke)
* SOLR-11662: New synonymQueryStyle field type option to configure whether a SynonymQuery, a DisjunctionMaxQuery,
or a BooleanQuery is built for query terms that overlap at the same position (i.e. synonyms).
(Doug Turnbull, David Smiley)
Bug Fixes
----------------------

View File

@ -52,13 +52,14 @@ public class QueryParser extends SolrQueryParserBase implements QueryParserConst
/**
* Builds the query for a single field, applying the phrase auto-generation settings before
* delegating to the superclass.
* <p>
* The multi-term-synonyms phrase flag is switched on when either the field or the parser asks for
* auto-generated phrase queries. The text is treated as quoted when it was explicitly quoted or,
* only if {@code getSplitOnWhitespace()} is true, when phrase auto-generation is requested.
*
* @param synonymQueryStyle passed through unchanged to the superclass
* @throws SyntaxError declared by the overridden method
*/
@Override
protected Query newFieldQuery(Analyzer analyzer, String field, String queryText,
boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries)
boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries,
SynonymQueryStyle synonymQueryStyle)
throws SyntaxError {
setAutoGenerateMultiTermSynonymsPhraseQuery(fieldAutoGenPhraseQueries || getAutoGeneratePhraseQueries());
// Don't auto-quote graph-aware field queries
boolean treatAsQuoted = getSplitOnWhitespace()
? (quoted || fieldAutoGenPhraseQueries || getAutoGeneratePhraseQueries()) : quoted;
// The literal false below is the superclass's fieldAutoGenPhraseQueries argument; phrase
// auto-generation has already been accounted for by the two statements above.
return super.newFieldQuery(analyzer, field, queryText, treatAsQuoted, false, fieldEnableGraphQueries);
return super.newFieldQuery(analyzer, field, queryText, treatAsQuoted, false, fieldEnableGraphQueries, synonymQueryStyle);
}
// * Query ::= ( Clause )*

View File

@ -34,6 +34,7 @@ import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiPhraseQuery;
@ -61,6 +62,8 @@ import org.apache.solr.search.QueryUtils;
import org.apache.solr.search.SolrConstantScoreQuery;
import org.apache.solr.search.SyntaxError;
import static org.apache.solr.parser.SolrQueryParserBase.SynonymQueryStyle.*;
/** This class is overridden by QueryParser in QueryParser.jj
* and acts to separate the majority of the Java code from the .jj grammar file.
*/
@ -78,6 +81,39 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
static final int MOD_NOT = 10;
static final int MOD_REQ = 11;
protected SynonymQueryStyle synonymQueryStyle = AS_SAME_TERM;
/**
* Query strategy used when analyzed query terms overlap at the same position (i.e. synonyms).
* The examples on each constant assume "pants" and "khakis" are query-time synonyms.
*
* <ul>
* <li>{@link #AS_SAME_TERM}</li>
* <li>{@link #PICK_BEST}</li>
* <li>{@link #AS_DISTINCT_TERMS}</li>
* </ul>
*/
public static enum SynonymQueryStyle {
/**
* (default) Synonym terms share doc freq,
* so if "pants" has a df of 500 and "khakis" a df of 50, a df of 500 is used when scoring both terms.
* Appropriate for exact synonyms.
* See {@link org.apache.lucene.search.SynonymQuery}.
*/
AS_SAME_TERM,
/**
* The highest scoring term match is chosen (i.e. dismax),
* so if "pants" has a df of 500 and "khakis" a df of 50, khakis matches are scored higher.
* Appropriate when more specific synonyms should score higher.
*/
PICK_BEST,
/**
* Each synonym is scored independently, then the scores are added together (i.e. boolean query),
* so if "pants" has a df of 500 and "khakis" a df of 50, khakis matches are scored higher but are
* summed with any "pants" matches.
* Appropriate when more specific synonyms should score higher, but we don't want to ignore
* less specific synonyms.
*/
AS_DISTINCT_TERMS
}
// make it possible to call setDefaultOperator() without accessing
// the nested class:
/** Alternative form of QueryParser.Operator.AND */
@ -330,6 +366,19 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
this.allowSubQueryParsing = allowSubQueryParsing;
}
/**
 * Sets how overlapping query terms (i.e. synonyms) should be scored: as if they were the same
 * term, by picking the highest scoring term, or by OR'ing them together.
 *
 * @param synonymQueryStyle how to score terms that overlap, see {@link SynonymQueryStyle}
 */
public void setSynonymQueryStyle(SynonymQueryStyle synonymQueryStyle) {
  this.synonymQueryStyle = synonymQueryStyle;
}
/**
 * Returns how overlapping query terms (i.e. synonyms) are currently scored.
 *
 * @return the synonym query style in effect on this parser
 */
public SynonymQueryStyle getSynonymQueryStyle() {
  return synonymQueryStyle;
}
/**
* Set to <code>true</code> to allow leading wildcard characters.
* <p>
@ -460,13 +509,16 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
}
/**
* Creates the query for {@code queryText} against {@code field} using the given analyzer.
* <p>
* The per-field graph-query flag and synonym query style are installed on this parser only for the
* duration of the {@code createFieldQuery} call and are then set back to {@code true} and
* {@code AS_SAME_TERM} respectively.
*
* @param quoted whether the text was quoted; phrase handling is also requested when
*        {@code fieldAutoGenPhraseQueries} or the parser-level {@code autoGeneratePhraseQueries} is set
* @param fieldEnableGraphQueries value passed to {@code setEnableGraphQueries} for this call
* @param synonymQueryStyle value passed to {@code setSynonymQueryStyle} for this call
* @return the query returned by {@code createFieldQuery}
* @throws SyntaxError declared for callers and overriding parsers
*/
protected Query newFieldQuery(Analyzer analyzer, String field, String queryText,
boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries)
boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries,
SynonymQueryStyle synonymQueryStyle)
throws SyntaxError {
// the parser's default operator decides whether generated clauses are required or optional
BooleanClause.Occur occur = operator == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
// temporarily apply the field-specific settings; they are read while the query is being built
setEnableGraphQueries(fieldEnableGraphQueries);
setSynonymQueryStyle(synonymQueryStyle);
Query query = createFieldQuery(analyzer, occur, field, queryText,
quoted || fieldAutoGenPhraseQueries || autoGeneratePhraseQueries, phraseSlop);
// NOTE(review): the two resets below are skipped if createFieldQuery throws, and they restore the
// built-in defaults rather than whatever values were set before this call - confirm that is intended.
setEnableGraphQueries(true); // reset back to default
setSynonymQueryStyle(AS_SAME_TERM);
return query;
}
@ -539,6 +591,29 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
return query;
}
/**
 * Builds the query for a group of terms that occupy the same position (i.e. synonyms),
 * according to the parser's current {@link SynonymQueryStyle}.
 *
 * @param terms the terms overlapping at one position
 * @return the superclass's synonym query for AS_SAME_TERM, a BooleanQuery of SHOULD term clauses
 *         for AS_DISTINCT_TERMS, or a DisjunctionMaxQuery with a tie-breaker of 0 for PICK_BEST
 */
@Override
protected Query newSynonymQuery(Term terms[]) {
  switch (synonymQueryStyle) {
    case AS_SAME_TERM: {
      return super.newSynonymQuery(terms);
    }
    case AS_DISTINCT_TERMS: {
      // every synonym becomes its own optional clause, so matching scores are summed
      BooleanQuery.Builder disjunction = new BooleanQuery.Builder();
      for (int i = 0; i < terms.length; i++) {
        disjunction.add(newTermQuery(terms[i]), BooleanClause.Occur.SHOULD);
      }
      return disjunction.build();
    }
    case PICK_BEST: {
      // tie-breaker 0.0f: only the best scoring synonym contributes to the score
      List<Query> alternatives = new ArrayList<>(terms.length);
      for (int i = 0; i < terms.length; i++) {
        alternatives.add(newTermQuery(terms[i]));
      }
      return new DisjunctionMaxQuery(alternatives, 0.0f);
    }
    default:
      throw new AssertionError("unrecognized synonymQueryStyle passed when creating newSynonymQuery");
  }
}
/**
* Builds a new FuzzyQuery instance
* @param term Term
@ -661,8 +736,13 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
boolean fieldAutoGenPhraseQueries = ft instanceof TextField && ((TextField)ft).getAutoGeneratePhraseQueries();
boolean fieldEnableGraphQueries = ft instanceof TextField && ((TextField)ft).getEnableGraphQueries();
SynonymQueryStyle synonymQueryStyle = AS_SAME_TERM;
if (ft instanceof TextField) {
synonymQueryStyle = ((TextField)(ft)).getSynonymQueryStyle();
}
subq = newFieldQuery(getAnalyzer(), sfield.getName(), rawq.getJoinedExternalVal(),
false, fieldAutoGenPhraseQueries, fieldEnableGraphQueries);
false, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, synonymQueryStyle);
booleanBuilder.add(subq, BooleanClause.Occur.SHOULD);
} else {
for (String externalVal : rawq.getExternalVals()) {
@ -979,7 +1059,11 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
if (ft.isTokenized() && sf.indexed()) {
boolean fieldAutoGenPhraseQueries = ft instanceof TextField && ((TextField)ft).getAutoGeneratePhraseQueries();
boolean fieldEnableGraphQueries = ft instanceof TextField && ((TextField)ft).getEnableGraphQueries();
return newFieldQuery(getAnalyzer(), field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries);
SynonymQueryStyle synonymQueryStyle = AS_SAME_TERM;
if (ft instanceof TextField) {
synonymQueryStyle = ((TextField)(ft)).getSynonymQueryStyle();
}
return newFieldQuery(getAnalyzer(), field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, synonymQueryStyle);
} else {
if (raw) {
return new RawQuery(sf, queryText);
@ -990,7 +1074,7 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
}
// default to a normal field query
return newFieldQuery(getAnalyzer(), field, queryText, quoted, false, true);
return newFieldQuery(getAnalyzer(), field, queryText, quoted, false, true, AS_SAME_TERM);
}
// Assumption: quoted is always false
@ -1024,8 +1108,12 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
String queryText = queryTerms.size() == 1 ? queryTerms.get(0) : String.join(" ", queryTerms);
boolean fieldAutoGenPhraseQueries = ft instanceof TextField && ((TextField)ft).getAutoGeneratePhraseQueries();
boolean fieldEnableGraphQueries = ft instanceof TextField && ((TextField)ft).getEnableGraphQueries();
SynonymQueryStyle synonymQueryStyle = AS_SAME_TERM;
if (ft instanceof TextField) {
synonymQueryStyle = ((TextField)(ft)).getSynonymQueryStyle();
}
return newFieldQuery
(getAnalyzer(), field, queryText, false, fieldAutoGenPhraseQueries, fieldEnableGraphQueries);
(getAnalyzer(), field, queryText, false, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, synonymQueryStyle);
} else {
if (raw) {
return new RawQuery(sf, queryTerms);
@ -1057,7 +1145,7 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
// default to a normal field query
String queryText = queryTerms.size() == 1 ? queryTerms.get(0) : String.join(" ", queryTerms);
return newFieldQuery(getAnalyzer(), field, queryText, false, false, true);
return newFieldQuery(getAnalyzer(), field, queryText, false, false, true, AS_SAME_TERM);
}
protected boolean isRangeShouldBeProtectedFromReverse(String field, String part1){

View File

@ -905,6 +905,7 @@ public abstract class FieldType extends FieldProperties {
protected static final String ENABLE_GRAPH_QUERIES = "enableGraphQueries";
private static final String ARGS = "args";
private static final String POSITION_INCREMENT_GAP = "positionIncrementGap";
protected static final String SYNONYM_QUERY_STYLE = "synonymQueryStyle";
/**
* Get a map of property name -&gt; value for this field type.
@ -926,6 +927,7 @@ public abstract class FieldType extends FieldProperties {
if (this instanceof TextField) {
namedPropertyValues.add(AUTO_GENERATE_PHRASE_QUERIES, ((TextField) this).getAutoGeneratePhraseQueries());
namedPropertyValues.add(ENABLE_GRAPH_QUERIES, ((TextField) this).getEnableGraphQueries());
namedPropertyValues.add(SYNONYM_QUERY_STYLE, ((TextField) this).getSynonymQueryStyle());
}
namedPropertyValues.add(getPropertyName(INDEXED), hasProperty(INDEXED));
namedPropertyValues.add(getPropertyName(STORED), hasProperty(STORED));

View File

@ -17,6 +17,7 @@
package org.apache.solr.schema;
import java.io.IOException;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
@ -29,6 +30,7 @@ import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.QueryBuilder;
import org.apache.solr.common.SolrException;
import org.apache.solr.parser.SolrQueryParserBase;
import org.apache.solr.query.SolrRangeQuery;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
@ -41,6 +43,7 @@ import org.apache.solr.uninverting.UninvertingReader.Type;
public class TextField extends FieldType {
protected boolean autoGeneratePhraseQueries;
protected boolean enableGraphQueries;
protected SolrQueryParserBase.SynonymQueryStyle synonymQueryStyle;
/**
* Analyzer set by schema for text types to use when searching fields
@ -72,6 +75,12 @@ public class TextField extends FieldType {
String autoGeneratePhraseQueriesStr = args.remove(AUTO_GENERATE_PHRASE_QUERIES);
if (autoGeneratePhraseQueriesStr != null)
autoGeneratePhraseQueries = Boolean.parseBoolean(autoGeneratePhraseQueriesStr);
synonymQueryStyle = SolrQueryParserBase.SynonymQueryStyle.AS_SAME_TERM;
String synonymQueryStyle = args.remove(SYNONYM_QUERY_STYLE);
if (synonymQueryStyle != null) {
this.synonymQueryStyle = SolrQueryParserBase.SynonymQueryStyle.valueOf(synonymQueryStyle.toUpperCase(Locale.ROOT));
}
enableGraphQueries = true;
String enableGraphQueriesStr = args.remove(ENABLE_GRAPH_QUERIES);
@ -104,6 +113,8 @@ public class TextField extends FieldType {
return enableGraphQueries;
}
/**
 * Returns the synonym query style configured for this field type.
 *
 * @return the style to apply to query terms that overlap at the same position
 */
public SolrQueryParserBase.SynonymQueryStyle getSynonymQueryStyle() {
  return synonymQueryStyle;
}
@Override
public SortField getSortField(SchemaField field, boolean reverse) {
/* :TODO: maybe warn if isTokenized(), but doesn't use LimitTokenCountFilter in its chain? */

View File

@ -1003,7 +1003,8 @@ public class ExtendedDismaxQParser extends QParser {
@Override
protected Query newFieldQuery(Analyzer analyzer, String field, String queryText,
boolean quoted, boolean fieldAutoGenPhraseQueries, boolean enableGraphQueries)
boolean quoted, boolean fieldAutoGenPhraseQueries, boolean enableGraphQueries,
SynonymQueryStyle synonymQueryStyle)
throws SyntaxError {
Analyzer actualAnalyzer;
if (removeStopFilter) {
@ -1017,7 +1018,7 @@ public class ExtendedDismaxQParser extends QParser {
} else {
actualAnalyzer = parser.getReq().getSchema().getFieldType(field).getQueryAnalyzer();
}
return super.newFieldQuery(actualAnalyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, enableGraphQueries);
return super.newFieldQuery(actualAnalyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, enableGraphQueries, synonymQueryStyle);
}
@Override

View File

@ -167,6 +167,67 @@
</analyzer>
</fieldType>
<!-- Text field type configured with synonymQueryStyle="pick_best" and autoGeneratePhraseQueries="true".
     Synonyms from synonyms.txt are expanded at query time only (SynonymGraphFilterFactory appears in the
     query analyzer, not the index analyzer). Backs the t_pick_best_* dynamic field. -->
<fieldType name="text_pick_best" class="solr.TextField" positionIncrementGap="100" synonymQueryStyle="pick_best" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
/>
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
/>
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
<!-- Text field type configured with synonymQueryStyle="as_distinct_terms" and autoGeneratePhraseQueries="true".
     Apart from the synonymQueryStyle value its analyzer chains are the same as those of text_pick_best.
     Backs the t_as_distinct_* dynamic field. -->
<fieldType name="text_as_distinct" class="solr.TextField" positionIncrementGap="100" synonymQueryStyle="as_distinct_terms" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
/>
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
/>
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="nametext" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
@ -590,6 +651,10 @@
<dynamicField name="*_sI" type="string" indexed="true" stored="false"/>
<dynamicField name="*_sS" type="string" indexed="false" stored="true"/>
<dynamicField name="t_pick_best_*" type="text_pick_best" indexed="true" stored="true"/>
<dynamicField name="t_as_distinct_*" type="text_as_distinct" indexed="true" stored="true"/>
<dynamicField name="t_*" type="text" indexed="true" stored="true"/>
<dynamicField name="tv_*" type="text" indexed="true" stored="true"
termVectors="true" termPositions="true" termOffsets="true"/>

View File

@ -31,4 +31,10 @@ pixima => pixma
# multiword synonyms
wi fi => wifi
crow blackbird, grackle
crow blackbird, grackle
# Synonyms used in semantic expansion
tabby => tabby, cat, feline, animal
persian => persian, cat, feline, animal
jeans, denim pants

View File

@ -96,6 +96,9 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
assertU(adoc("id", "71", "text_sw", "ties"));
assertU(adoc("id", "72", "text_sw", "wifi ATM"));
assertU(adoc("id", "73", "shingle23", "A B X D E"));
// assertU(adoc("id", "74", "text_pick_best", "tabby"));
// assertU(adoc("id", "74", "text_as_distinct", "persian"));
assertU(commit());
}
@ -2018,10 +2021,11 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
**/
@Override
protected Query newFieldQuery(Analyzer analyzer, String field, String queryText,
boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries)
boolean quoted, boolean fieldAutoGenPhraseQueries,
boolean fieldEnableGraphQueries, SynonymQueryStyle synonymQueryStyle)
throws SyntaxError {
Query q = super.newFieldQuery
(analyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries);
(analyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, synonymQueryStyle);
if (q instanceof BooleanQuery) {
boolean rewrittenSubQ = false; // dirty flag: rebuild the repacked query?
BooleanQuery.Builder builder = newBooleanQuery();

View File

@ -1057,7 +1057,25 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
, "/response/numFound==1"
);
}
/** Checks the query structure produced for synonyms under the pick_best and as_distinct_terms styles. */
public void testSynonymQueryStyle() throws Exception {
  // single-term synonyms on a pick_best field
  final Query pickBestTabby =
      QParser.getParser("tabby", req(params("df", "t_pick_best_foo"))).getQuery();
  assertEquals(
      "(t_pick_best_foo:tabbi | t_pick_best_foo:cat | t_pick_best_foo:felin | t_pick_best_foo:anim)",
      pickBestTabby.toString());

  // single-term synonyms on an as_distinct_terms field
  final Query distinctTabby =
      QParser.getParser("tabby", req(params("df", "t_as_distinct_foo"))).getQuery();
  assertEquals(
      "t_as_distinct_foo:tabbi t_as_distinct_foo:cat t_as_distinct_foo:felin t_as_distinct_foo:anim",
      distinctTabby.toString());

  /*confirm autoGeneratePhraseQueries always builds OR queries*/
  final Query distinctJeans =
      QParser.getParser("jeans", req(params("df", "t_as_distinct_foo", "sow", "false"))).getQuery();
  assertEquals(
      "(t_as_distinct_foo:\"denim pant\" t_as_distinct_foo:jean)",
      distinctJeans.toString());

  final Query pickBestJeans =
      QParser.getParser("jeans", req(params("df", "t_pick_best_foo", "sow", "false"))).getQuery();
  assertEquals(
      "(t_pick_best_foo:\"denim pant\" t_pick_best_foo:jean)",
      pickBestJeans.toString());
}
@Test
public void testBadRequestInSetQuery() throws SyntaxError {
SolrQueryRequest req = req();

View File

@ -87,6 +87,13 @@ For multivalued fields, specifies a distance between multiple values, which prev
`autoGeneratePhraseQueries`:: For text fields. If `true`, Solr automatically generates phrase queries for adjacent terms. If `false`, terms must be enclosed in double-quotes to be treated as phrases.
`synonymQueryStyle`::
Query used to combine scores of overlapping query terms (i.e. synonyms). Consider a search for "blue tee" with query-time synonyms `tshirt,tee`.
+
Use `as_same_term` (default) to blend terms, i.e. `SynonymQuery(tshirt,tee)`, where each term is treated as equally important. Use `pick_best` to score only the most significant synonym, i.e. `Dismax(tee,tshirt)`. Use `as_distinct_terms` to score each synonym separately and add the scores together, which biases scoring towards the most significant synonym without ignoring the others, i.e. `(tshirt OR tee)`.
+
`as_same_term` is appropriate when terms are true synonyms (television, tv). Use `pick_best` or `as_distinct_terms` when synonyms expand to hyponyms `(q=jeans w/ jeans\=>jeans,pants)` and you want exact matches to rank ahead of parent and sibling concepts. See this http://opensourceconnections.com/blog/2017/11/21/solr-synonyms-mea-culpa/[blog article].
`enableGraphQueries`::
For text fields, applicable when querying with <<the-standard-query-parser.adoc#standard-query-parser-parameters,`sow=false`>> (which is the default for the `sow` parameter). Use `true`, the default, for field types with query analyzers including graph-aware filters, e.g., <<filter-descriptions.adoc#synonym-graph-filter,Synonym Graph Filter>> and <<filter-descriptions.adoc#word-delimiter-graph-filter,Word Delimiter Graph Filter>>.
+
@ -138,4 +145,4 @@ The default values for each property depend on the underlying `FieldType` class,
A field type may optionally specify a `<similarity/>` that will be used when scoring documents that refer to fields with this type, as long as the "global" similarity for the collection allows it.
By default, any field type which does not define a similarity, uses `BM25Similarity`. For more details, and examples of configuring both global & per-type Similarities, please see <<other-schema-elements.adoc#similarity,Other Schema Elements>>.
By default, any field type which does not define a similarity, uses `BM25Similarity`. For more details, and examples of configuring both global & per-type Similarities, please see <<other-schema-elements.adoc#similarity,Other Schema Elements>>.