SOLR-10310: By default, stop splitting on whitespace prior to analysis in edismax and standard/"lucene" query parsers

This commit is contained in:
Steve Rowe 2017-04-25 12:02:25 -04:00
parent 4f89f98f66
commit dd171ff8fe
6 changed files with 153 additions and 72 deletions

View File

@ -64,6 +64,11 @@ Upgrading from Solr 6.x
registries as hierarchical MBeans. This behavior can also be disabled by specifying a SolrJmxReporter
configuration with a boolean init arg "enabled" set to "false". For more fine-grained control, users
should explicitly specify at least one SolrJmxReporter configuration.
* The sow (split-on-whitespace) request param now defaults to false (true in previous versions).
This affects the edismax and standard/"lucene" query parsers: if the sow param is not specified,
query text will not be split on whitespace before analysis. See
https://lucidworks.com/2017/04/18/multi-word-synonyms-solr-adds-query-time-support/ .
New Features
----------------------
@ -119,7 +124,8 @@ Other Changes
* SOLR-10557: Make "compact" format default for /admin/metrics. (ab)
----------------------
* SOLR-10310: By default, stop splitting on whitespace prior to analysis
in edismax and standard/"lucene" query parsers. (Steve Rowe)
================== 6.6.0 ==================

View File

@ -21,7 +21,7 @@ public class QueryParser extends SolrQueryParserBase implements QueryParserConst
static public enum Operator { OR, AND }
/** default split on whitespace behavior */
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = true;
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = false;
public QueryParser(String defaultField, QParser parser) {
this(new FastCharStream(new StringReader("")));

View File

@ -45,7 +45,7 @@ public class QueryParser extends SolrQueryParserBase {
static public enum Operator { OR, AND }
/** default split on whitespace behavior */
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = true;
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = false;
public QueryParser(String defaultField, QParser parser) {
this(new FastCharStream(new StringReader("")));

View File

@ -98,6 +98,7 @@ public class DisMaxQParser extends QParser {
@Override
public Query parse() throws SyntaxError {
parsed = true;
SolrParams solrParams = SolrParams.wrapDefaults(localParams, params);
@ -265,6 +266,7 @@ public class DisMaxQParser extends QParser {
IMPOSSIBLE_FIELD_NAME);
parser.addAlias(IMPOSSIBLE_FIELD_NAME, tiebreaker, fields);
parser.setPhraseSlop(slop);
parser.setSplitOnWhitespace(true);
return parser;
}

View File

@ -23,6 +23,8 @@ import java.util.Random;
import java.util.Set;
import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
@ -364,8 +366,16 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
assertQ(req("defType","edismax", "mm","0", "q","Terminator: 100", "qf","movies_t foo_i"),
twor);
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i"),
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i", "sow","true"),
nor);
// When sow=false, the per-field query structures differ (no "Terminator" query on integer field foo_i),
// so a dismax-per-field is constructed. As a result, mm=100% is applied per-field instead of per-term;
// since there is only one term (100) required in the foo_i field's dismax, the query can match docs that
// only have the 100 term in the foo_i field, and don't necessarily have "Terminator" in any field.
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i", "sow","false"),
oner);
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i"), // default sow=false
oner);
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 8", "qf","movies_t foo_i"),
oner);
@ -1413,19 +1423,21 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
assertJQ(req("qf", "text_sw title", "defType","edismax", "q","wi fi", "sow","true")
, "/response/numFound==0"
);
assertJQ(req("qf","text_sw title", "defType","edismax", "q","wi fi") // default sow=true
, "/response/numFound==0"
assertJQ(req("qf","text_sw title", "defType","edismax", "q","wi fi") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='72'"
);
assertJQ(req("qf","text_sw title", "q","{!edismax sow=false}wi fi")
, "/response/numFound==1"
, "/response/docs/[0]/id=='72'"
);
assertJQ(req("df", "text_sw title", "q","{!edismax sow=true}wi fi")
assertJQ(req("qf", "text_sw title", "q","{!edismax sow=true}wi fi")
, "/response/numFound==0"
);
assertJQ(req("df", "text_sw title", "q", "{!edismax}wi fi") // default sow=true
, "/response/numFound==0"
assertJQ(req("qf", "text_sw title", "q", "{!edismax}wi fi") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='72'"
);
assertQ(req("qf", "name title",
@ -1451,7 +1463,7 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
assertQ(req("qf", "name title",
"q", "barking curds of stigma",
"defType", "edismax",
"debugQuery", "true"), // Default sow=true
"debugQuery", "true"), // Default sow=false
"//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:barking | title:barking))')]",
"//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:curds | title:curds))')]",
"//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:of | title:of))')]",
@ -1768,18 +1780,18 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
//
// crow blackbird, grackle
try (SolrQueryRequest req = req(sowFalseParams)) {
QParser qParser = QParser.getParser("text:grackle", "edismax", req); // "text" has autoGeneratePhraseQueries="true"
Query q = qParser.getQuery();
assertEquals("+(text:\"crow blackbird\" text:grackl)", q.toString());
}
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams)) {
for (SolrParams params : Arrays.asList(noSowParams, sowFalseParams)) {
try (SolrQueryRequest req = req(params)) {
QParser qParser = QParser.getParser("text:grackle", "edismax", req);
QParser qParser = QParser.getParser("text:grackle", "edismax", req); // "text" has autoGeneratePhraseQueries="true"
Query q = qParser.getQuery();
assertEquals("+spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
assertEquals("+(text:\"crow blackbird\" text:grackl)", q.toString());
}
}
try (SolrQueryRequest req = req(sowTrueParams)) {
QParser qParser = QParser.getParser("text:grackle", "edismax", req);
Query q = qParser.getQuery();
assertEquals("+spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
}
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams, sowFalseParams)) {
try (SolrQueryRequest req = req(params)) {
QParser qParser = QParser.getParser("text_sw:grackle", "edismax", req); // "text_sw" doesn't specify autoGeneratePhraseQueries => default false
@ -1790,35 +1802,35 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
Stream.of(noSowParams, sowTrueParams, sowFalseParams).forEach(p->p.add("qf", "text text_sw"));
try (SolrQueryRequest req = req(sowFalseParams)) {
QParser qParser = QParser.getParser("grackle", "edismax", req);
Query q = qParser.getQuery();
assertEquals("+((text:\"crow blackbird\" text:grackl)"
+ " | ((+text_sw:crow +text_sw:blackbird) text_sw:grackl))",
q.toString());
qParser = QParser.getParser("grackle wi fi", "edismax", req);
q = qParser.getQuery();
assertEquals("+(((text:\"crow blackbird\" text:grackl) text:wifi)"
+ " | (((+text_sw:crow +text_sw:blackbird) text_sw:grackl) text_sw:wifi))",
q.toString());
}
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams)) {
for (SolrParams params : Arrays.asList(noSowParams, sowFalseParams)) {
try (SolrQueryRequest req = req(params)) {
QParser qParser = QParser.getParser("grackle", "edismax", req);
Query q = qParser.getQuery();
assertEquals("+(spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])"
assertEquals("+((text:\"crow blackbird\" text:grackl)"
+ " | ((+text_sw:crow +text_sw:blackbird) text_sw:grackl))",
q.toString());
qParser = QParser.getParser("grackle wi fi", "edismax", req);
q = qParser.getQuery();
assertEquals("+((spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])"
+ " | ((+text_sw:crow +text_sw:blackbird) text_sw:grackl)) (text:wi | text_sw:wi) (text:fi | text_sw:fi))",
assertEquals("+(((text:\"crow blackbird\" text:grackl) text:wifi)"
+ " | (((+text_sw:crow +text_sw:blackbird) text_sw:grackl) text_sw:wifi))",
q.toString());
}
}
try (SolrQueryRequest req = req(sowTrueParams)) {
QParser qParser = QParser.getParser("grackle", "edismax", req);
Query q = qParser.getQuery();
assertEquals("+(spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])"
+ " | ((+text_sw:crow +text_sw:blackbird) text_sw:grackl))",
q.toString());
qParser = QParser.getParser("grackle wi fi", "edismax", req);
q = qParser.getQuery();
assertEquals("+((spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])"
+ " | ((+text_sw:crow +text_sw:blackbird) text_sw:grackl)) (text:wi | text_sw:wi) (text:fi | text_sw:fi))",
q.toString());
}
}
public void testSowFalseWithBoost() throws Exception {
@ -1938,6 +1950,8 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
static class FuzzyDismaxQParser extends ExtendedDismaxQParser {
private static final float MIN_SIMILARITY = 0.75F;
public FuzzyDismaxQParser(String qstr, SolrParams localParams,
SolrParams params, SolrQueryRequest req) {
@ -1958,16 +1972,50 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
super(parser, defaultField);
frequentlyMisspelledWords = new HashSet<>();
frequentlyMisspelledWords.add("absence");
frequentlyMisspelledWords.add("absenc");
}
@Override
protected Query getFieldQuery(String field,
String val, boolean quoted, boolean raw) throws SyntaxError {
if(frequentlyMisspelledWords.contains(val)) {
return getFuzzyQuery(field, val, 0.75F);
return getFuzzyQuery(field, val, MIN_SIMILARITY);
}
return super.getFieldQuery(field, val, quoted, raw);
}
/**
 * Builds the field query through the superclass and, when that result is a
 * {@link BooleanQuery}, repacks its clauses so that any term clause whose text is a
 * frequently misspelled word is replaced by a fuzzy query with the same occur.
 * The superclass result is returned untouched when it is not a boolean query or when
 * no clause needed rewriting.
 **/
@Override
protected Query newFieldQuery(Analyzer analyzer, String field, String queryText,
    boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries)
    throws SyntaxError {
  Query original = super.newFieldQuery(
      analyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries);
  if (!(original instanceof BooleanQuery)) {
    return original;
  }

  BooleanQuery booleanQuery = (BooleanQuery) original;
  BooleanQuery.Builder repacked = newBooleanQuery();
  boolean anyClauseRewritten = false; // only swap in the repacked query if something changed
  for (BooleanClause clause : booleanQuery.clauses()) {
    BooleanClause toAdd = clause;
    Query inner = clause.getQuery();
    if (inner instanceof TermQuery) {
      Term term = ((TermQuery) inner).getTerm();
      if (frequentlyMisspelledWords.contains(term.text())) {
        anyClauseRewritten = true;
        toAdd = newBooleanClause(
            newFuzzyQuery(term, MIN_SIMILARITY, getFuzzyPrefixLength()), clause.getOccur());
      }
    }
    repacked.add(toAdd);
  }

  if (!anyClauseRewritten) {
    return original;
  }
  // Carry over the minimum-should-match setting of the query being replaced.
  repacked.setMinimumNumberShouldMatch(booleanQuery.getMinimumNumberShouldMatch());
  return repacked.build();
}
}
}

View File

@ -75,10 +75,23 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
@Test
public void testPhrase() {
// "text" field's type has WordDelimiterGraphFilter (WDGFF) and autoGeneratePhraseQueries=true
// should generate a phrase of "now cow" and match only one doc
assertQ(req("q", "text:now-cow", "indent", "true")
assertQ(req("q", "text:now-cow", "indent", "true", "sow","true")
, "//*[@numFound='1']"
);
// When sow=false, autoGeneratePhraseQueries=true only works when a graph is produced
// (i.e. overlapping terms, e.g. if WDGFF's preserveOriginal=1 or concatenateWords=1).
// The WDGFF config on the "text" field doesn't produce a graph, so the generated query
// is not a phrase query. As a result, docs can match that don't match phrase query "now cow"
assertQ(req("q", "text:now-cow", "indent", "true", "sow","false")
, "//*[@numFound='2']"
);
assertQ(req("q", "text:now-cow", "indent", "true") // default sow=false
, "//*[@numFound='2']"
);
// "text_np" field's type has WDGFF and (default) autoGeneratePhraseQueries=false
// should generate a query of (now OR cow) and match both docs
assertQ(req("q", "text_np:now-cow", "indent", "true")
, "//*[@numFound='2']"
@ -593,8 +606,9 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
assertJQ(req("df", "syn", "q", "wi fi", "sow", "true")
, "/response/numFound==0"
);
assertJQ(req("df", "syn", "q", "wi fi") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", "wi fi") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
assertJQ(req("df", "syn", "q", "{!lucene sow=false}wi fi")
@ -605,8 +619,9 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
, "/response/numFound==0"
);
assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
}
@ -654,20 +669,25 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
, "/response/numFound==0"
);
assertJQ(req("df", "syn", "q", "wi fi") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", "wi fi") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
assertJQ(req("df", "syn", "q", "wi /* foo */ fi") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", "wi /* foo */ fi") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
assertJQ(req("df", "syn", "q", "wi /* foo */ /* bar */ fi") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", "wi /* foo */ /* bar */ fi") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
assertJQ(req("df", "syn", "q", "/* foo */ wi fi /* bar */") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", " /* foo */ wi fi /* bar */") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
assertJQ(req("df", "syn", "q", "/* foo */ wi /* bar */ fi /* baz */") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", " /* foo */ wi /* bar */ fi /* baz */") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
@ -708,20 +728,25 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
, "/response/numFound==0"
);
assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ fi") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ fi") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ /* bar */ fi") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ /* bar */ fi") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi fi /* bar */") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi fi /* bar */") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi /* bar */ fi /* baz */") // default sow=true
, "/response/numFound==0"
assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi /* bar */ fi /* baz */") // default sow=false
, "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
);
}
@ -977,18 +1002,18 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
//
try (SolrQueryRequest req = req()) {
QParser qParser = QParser.getParser("text:grackle", req); // "text" has autoGeneratePhraseQueries="true"
qParser.setParams(sowFalseParams);
Query q = qParser.getQuery();
assertEquals("text:\"crow blackbird\" text:grackl", q.toString());
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams)) {
qParser = QParser.getParser("text:grackle", req);
qParser.setParams(params);
q = qParser.getQuery();
assertEquals("spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
// Exercise both the implicit default (no sow param) and an explicit sow=false:
// each must yield the same non-split query now that sow defaults to false.
for (SolrParams params : Arrays.asList(noSowParams, sowFalseParams)) {
  QParser qParser = QParser.getParser("text:grackle", req); // "text" has autoGeneratePhraseQueries="true"
  qParser.setParams(params); // use the loop variable so the noSowParams case is really tested
  Query q = qParser.getQuery();
  assertEquals("text:\"crow blackbird\" text:grackl", q.toString());
}
QParser qParser = QParser.getParser("text:grackle", req);
qParser.setParams(sowTrueParams);
Query q = qParser.getQuery();
assertEquals("spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams, sowFalseParams)) {
qParser = QParser.getParser("text_sw:grackle", req); // "text_sw" doesn't specify autoGeneratePhraseQueries => default false
qParser.setParams(params);