SOLR-10310: By default, stop splitting on whitespace prior to analysis in edismax and standard/"lucene" query parsers

This commit is contained in:
Steve Rowe 2017-04-25 12:02:25 -04:00
parent 4f89f98f66
commit dd171ff8fe
6 changed files with 153 additions and 72 deletions

View File

@ -65,6 +65,11 @@ Upgrading from Solr 6.x
configuration with a boolean init arg "enabled" set to "false". For more fine-grained control, users configuration with a boolean init arg "enabled" set to "false". For more fine-grained control, users
should explicitly specify at least one SolrJmxReporter configuration. should explicitly specify at least one SolrJmxReporter configuration.
* The sow (split-on-whitespace) request param now defaults to false (true in previous versions).
This affects the edismax and standard/"lucene" query parsers: if the sow param is not specified,
query text will not be split on whitespace before analysis. See
https://lucidworks.com/2017/04/18/multi-word-synonyms-solr-adds-query-time-support/ .
New Features New Features
---------------------- ----------------------
* SOLR-9857, SOLR-9858: Collect aggregated metrics from nodes and shard leaders in overseer. (ab) * SOLR-9857, SOLR-9858: Collect aggregated metrics from nodes and shard leaders in overseer. (ab)
@ -119,7 +124,8 @@ Other Changes
* SOLR-10557: Make "compact" format default for /admin/metrics. (ab) * SOLR-10557: Make "compact" format default for /admin/metrics. (ab)
---------------------- * SOLR-10310: By default, stop splitting on whitespace prior to analysis
in edismax and standard/"lucene" query parsers. (Steve Rowe)
================== 6.6.0 ================== ================== 6.6.0 ==================

View File

@ -21,7 +21,7 @@ public class QueryParser extends SolrQueryParserBase implements QueryParserConst
static public enum Operator { OR, AND } static public enum Operator { OR, AND }
/** default split on whitespace behavior */ /** default split on whitespace behavior */
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = true; public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = false;
public QueryParser(String defaultField, QParser parser) { public QueryParser(String defaultField, QParser parser) {
this(new FastCharStream(new StringReader(""))); this(new FastCharStream(new StringReader("")));

View File

@ -45,7 +45,7 @@ public class QueryParser extends SolrQueryParserBase {
static public enum Operator { OR, AND } static public enum Operator { OR, AND }
/** default split on whitespace behavior */ /** default split on whitespace behavior */
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = true; public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = false;
public QueryParser(String defaultField, QParser parser) { public QueryParser(String defaultField, QParser parser) {
this(new FastCharStream(new StringReader(""))); this(new FastCharStream(new StringReader("")));

View File

@ -98,6 +98,7 @@ public class DisMaxQParser extends QParser {
@Override @Override
public Query parse() throws SyntaxError { public Query parse() throws SyntaxError {
parsed = true; parsed = true;
SolrParams solrParams = SolrParams.wrapDefaults(localParams, params); SolrParams solrParams = SolrParams.wrapDefaults(localParams, params);
@ -265,6 +266,7 @@ public class DisMaxQParser extends QParser {
IMPOSSIBLE_FIELD_NAME); IMPOSSIBLE_FIELD_NAME);
parser.addAlias(IMPOSSIBLE_FIELD_NAME, tiebreaker, fields); parser.addAlias(IMPOSSIBLE_FIELD_NAME, tiebreaker, fields);
parser.setPhraseSlop(slop); parser.setPhraseSlop(slop);
parser.setSplitOnWhitespace(true);
return parser; return parser;
} }

View File

@ -23,6 +23,8 @@ import java.util.Random;
import java.util.Set; import java.util.Set;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.BoostQuery;
@ -364,8 +366,16 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
assertQ(req("defType","edismax", "mm","0", "q","Terminator: 100", "qf","movies_t foo_i"), assertQ(req("defType","edismax", "mm","0", "q","Terminator: 100", "qf","movies_t foo_i"),
twor); twor);
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i"), assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i", "sow","true"),
nor); nor);
// When sow=false, the per-field query structures differ (no "Terminator" query on integer field foo_i),
// so a dismax-per-field is constructed. As a result, mm=100% is applied per-field instead of per-term;
// since there is only one term (100) required in the foo_i field's dismax, the query can match docs that
// only have the 100 term in the foo_i field, and don't necessarily have "Terminator" in any field.
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i", "sow","false"),
oner);
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i"), // default sow=false
oner);
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 8", "qf","movies_t foo_i"), assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 8", "qf","movies_t foo_i"),
oner); oner);
@ -1413,19 +1423,21 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
assertJQ(req("qf", "text_sw title", "defType","edismax", "q","wi fi", "sow","true") assertJQ(req("qf", "text_sw title", "defType","edismax", "q","wi fi", "sow","true")
, "/response/numFound==0" , "/response/numFound==0"
); );
assertJQ(req("qf","text_sw title", "defType","edismax", "q","wi fi") // default sow=true assertJQ(req("qf","text_sw title", "defType","edismax", "q","wi fi") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='72'"
); );
assertJQ(req("qf","text_sw title", "q","{!edismax sow=false}wi fi") assertJQ(req("qf","text_sw title", "q","{!edismax sow=false}wi fi")
, "/response/numFound==1" , "/response/numFound==1"
, "/response/docs/[0]/id=='72'" , "/response/docs/[0]/id=='72'"
); );
assertJQ(req("df", "text_sw title", "q","{!edismax sow=true}wi fi") assertJQ(req("qf", "text_sw title", "q","{!edismax sow=true}wi fi")
, "/response/numFound==0" , "/response/numFound==0"
); );
assertJQ(req("df", "text_sw title", "q", "{!edismax}wi fi") // default sow=true assertJQ(req("qf", "text_sw title", "q", "{!edismax}wi fi") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='72'"
); );
assertQ(req("qf", "name title", assertQ(req("qf", "name title",
@ -1451,7 +1463,7 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
assertQ(req("qf", "name title", assertQ(req("qf", "name title",
"q", "barking curds of stigma", "q", "barking curds of stigma",
"defType", "edismax", "defType", "edismax",
"debugQuery", "true"), // Default sow=true "debugQuery", "true"), // Default sow=false
"//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:barking | title:barking))')]", "//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:barking | title:barking))')]",
"//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:curds | title:curds))')]", "//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:curds | title:curds))')]",
"//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:of | title:of))')]", "//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:of | title:of))')]",
@ -1768,18 +1780,18 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
// //
// crow blackbird, grackle // crow blackbird, grackle
try (SolrQueryRequest req = req(sowFalseParams)) { for (SolrParams params : Arrays.asList(noSowParams, sowFalseParams)) {
QParser qParser = QParser.getParser("text:grackle", "edismax", req); // "text" has autoGeneratePhraseQueries="true"
Query q = qParser.getQuery();
assertEquals("+(text:\"crow blackbird\" text:grackl)", q.toString());
}
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams)) {
try (SolrQueryRequest req = req(params)) { try (SolrQueryRequest req = req(params)) {
QParser qParser = QParser.getParser("text:grackle", "edismax", req); QParser qParser = QParser.getParser("text:grackle", "edismax", req); // "text" has autoGeneratePhraseQueries="true"
Query q = qParser.getQuery(); Query q = qParser.getQuery();
assertEquals("+spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString()); assertEquals("+(text:\"crow blackbird\" text:grackl)", q.toString());
} }
} }
try (SolrQueryRequest req = req(sowTrueParams)) {
QParser qParser = QParser.getParser("text:grackle", "edismax", req);
Query q = qParser.getQuery();
assertEquals("+spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
}
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams, sowFalseParams)) { for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams, sowFalseParams)) {
try (SolrQueryRequest req = req(params)) { try (SolrQueryRequest req = req(params)) {
QParser qParser = QParser.getParser("text_sw:grackle", "edismax", req); // "text_sw" doesn't specify autoGeneratePhraseQueries => default false QParser qParser = QParser.getParser("text_sw:grackle", "edismax", req); // "text_sw" doesn't specify autoGeneratePhraseQueries => default false
@ -1790,35 +1802,35 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
Stream.of(noSowParams, sowTrueParams, sowFalseParams).forEach(p->p.add("qf", "text text_sw")); Stream.of(noSowParams, sowTrueParams, sowFalseParams).forEach(p->p.add("qf", "text text_sw"));
try (SolrQueryRequest req = req(sowFalseParams)) { for (SolrParams params : Arrays.asList(noSowParams, sowFalseParams)) {
QParser qParser = QParser.getParser("grackle", "edismax", req);
Query q = qParser.getQuery();
assertEquals("+((text:\"crow blackbird\" text:grackl)"
+ " | ((+text_sw:crow +text_sw:blackbird) text_sw:grackl))",
q.toString());
qParser = QParser.getParser("grackle wi fi", "edismax", req);
q = qParser.getQuery();
assertEquals("+(((text:\"crow blackbird\" text:grackl) text:wifi)"
+ " | (((+text_sw:crow +text_sw:blackbird) text_sw:grackl) text_sw:wifi))",
q.toString());
}
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams)) {
try (SolrQueryRequest req = req(params)) { try (SolrQueryRequest req = req(params)) {
QParser qParser = QParser.getParser("grackle", "edismax", req); QParser qParser = QParser.getParser("grackle", "edismax", req);
Query q = qParser.getQuery(); Query q = qParser.getQuery();
assertEquals("+(spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])" assertEquals("+((text:\"crow blackbird\" text:grackl)"
+ " | ((+text_sw:crow +text_sw:blackbird) text_sw:grackl))", + " | ((+text_sw:crow +text_sw:blackbird) text_sw:grackl))",
q.toString()); q.toString());
qParser = QParser.getParser("grackle wi fi", "edismax", req); qParser = QParser.getParser("grackle wi fi", "edismax", req);
q = qParser.getQuery(); q = qParser.getQuery();
assertEquals("+((spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])" assertEquals("+(((text:\"crow blackbird\" text:grackl) text:wifi)"
+ " | ((+text_sw:crow +text_sw:blackbird) text_sw:grackl)) (text:wi | text_sw:wi) (text:fi | text_sw:fi))", + " | (((+text_sw:crow +text_sw:blackbird) text_sw:grackl) text_sw:wifi))",
q.toString()); q.toString());
} }
} }
try (SolrQueryRequest req = req(sowTrueParams)) {
QParser qParser = QParser.getParser("grackle", "edismax", req);
Query q = qParser.getQuery();
assertEquals("+(spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])"
+ " | ((+text_sw:crow +text_sw:blackbird) text_sw:grackl))",
q.toString());
qParser = QParser.getParser("grackle wi fi", "edismax", req);
q = qParser.getQuery();
assertEquals("+((spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])"
+ " | ((+text_sw:crow +text_sw:blackbird) text_sw:grackl)) (text:wi | text_sw:wi) (text:fi | text_sw:fi))",
q.toString());
}
} }
public void testSowFalseWithBoost() throws Exception { public void testSowFalseWithBoost() throws Exception {
@ -1939,6 +1951,8 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
static class FuzzyDismaxQParser extends ExtendedDismaxQParser { static class FuzzyDismaxQParser extends ExtendedDismaxQParser {
private static final float MIN_SIMILARITY = 0.75F;
public FuzzyDismaxQParser(String qstr, SolrParams localParams, public FuzzyDismaxQParser(String qstr, SolrParams localParams,
SolrParams params, SolrQueryRequest req) { SolrParams params, SolrQueryRequest req) {
super(qstr, localParams, params, req); super(qstr, localParams, params, req);
@ -1958,16 +1972,50 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
super(parser, defaultField); super(parser, defaultField);
frequentlyMisspelledWords = new HashSet<>(); frequentlyMisspelledWords = new HashSet<>();
frequentlyMisspelledWords.add("absence"); frequentlyMisspelledWords.add("absence");
frequentlyMisspelledWords.add("absenc");
} }
/**
 * Builds the query for a single field/value pair. A value that appears in
 * frequentlyMisspelledWords is turned into a fuzzy query via getFuzzyQuery; any other
 * value is passed unchanged to the parent implementation.
 *
 * Each line below shows the pre-change text followed by the post-change text. The only
 * difference between the two is the similarity argument: the literal 0.75F before,
 * the MIN_SIMILARITY constant after.
 */
@Override @Override
protected Query getFieldQuery(String field, protected Query getFieldQuery(String field,
String val, boolean quoted, boolean raw) throws SyntaxError { String val, boolean quoted, boolean raw) throws SyntaxError {
if(frequentlyMisspelledWords.contains(val)) { if(frequentlyMisspelledWords.contains(val)) {
return getFuzzyQuery(field, val, 0.75F); return getFuzzyQuery(field, val, MIN_SIMILARITY);
} }
return super.getFieldQuery(field, val, quoted, raw); return super.getFieldQuery(field, val, quoted, raw);
} }
/**
 * Post-processes the query the parent builds for (possibly multi-term) query text.
 *
 * When the parent returns a {@link BooleanQuery}, every top-level {@link TermQuery}
 * clause whose term text is listed in frequentlyMisspelledWords is swapped for a fuzzy
 * query on the same term, keeping the clause's original occur. All other clauses are
 * carried over untouched, and the minimum-should-match setting is copied to the rebuilt
 * query. If no clause needed rewriting, or the parent's result is not a boolean query,
 * the parent's query is returned as is.
 */
@Override
protected Query newFieldQuery(Analyzer analyzer, String field, String queryText,
    boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries)
    throws SyntaxError {
  final Query parsed = super.newFieldQuery(
      analyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries);
  if (!(parsed instanceof BooleanQuery)) {
    return parsed;
  }

  final BooleanQuery original = (BooleanQuery) parsed;
  final BooleanQuery.Builder repacked = newBooleanQuery();
  boolean anyFuzzy = false; // stays false when every clause is copied verbatim

  for (BooleanClause clause : original.clauses()) {
    final Query inner = clause.getQuery();
    final Term term = (inner instanceof TermQuery) ? ((TermQuery) inner).getTerm() : null;
    if (term != null && frequentlyMisspelledWords.contains(term.text())) {
      final Query fuzzy = newFuzzyQuery(term, MIN_SIMILARITY, getFuzzyPrefixLength());
      repacked.add(newBooleanClause(fuzzy, clause.getOccur()));
      anyFuzzy = true;
    } else {
      repacked.add(clause);
    }
  }

  if (!anyFuzzy) {
    // Nothing was rewritten: hand back the parent's query rather than an equivalent copy.
    return parsed;
  }
  repacked.setMinimumNumberShouldMatch(original.getMinimumNumberShouldMatch());
  return repacked.build();
}
} }
} }

View File

@ -75,10 +75,23 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
@Test @Test
public void testPhrase() { public void testPhrase() {
// "text" field's type has WordDelimiterGraphFilter (WDGFF) and autoGeneratePhraseQueries=true
// should generate a phrase of "now cow" and match only one doc // should generate a phrase of "now cow" and match only one doc
assertQ(req("q", "text:now-cow", "indent", "true") assertQ(req("q", "text:now-cow", "indent", "true", "sow","true")
, "//*[@numFound='1']" , "//*[@numFound='1']"
); );
// When sow=false, autoGeneratePhraseQueries=true only takes effect when analysis produces a graph
// (i.e. overlapping terms, e.g. when WDGFF is configured with preserveOriginal=1 or concatenateWords=1).
// The WDGFF config on the "text" field doesn't produce a graph, so the generated query
// is not a phrase query. As a result, docs that do not match the phrase query "now cow" can still match.
assertQ(req("q", "text:now-cow", "indent", "true", "sow","false")
, "//*[@numFound='2']"
);
assertQ(req("q", "text:now-cow", "indent", "true") // default sow=false
, "//*[@numFound='2']"
);
// "text_np" field's type has WDGFF and (default) autoGeneratePhraseQueries=false
// should generate a query of (now OR cow) and match both docs // should generate a query of (now OR cow) and match both docs
assertQ(req("q", "text_np:now-cow", "indent", "true") assertQ(req("q", "text_np:now-cow", "indent", "true")
, "//*[@numFound='2']" , "//*[@numFound='2']"
@ -593,8 +606,9 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
assertJQ(req("df", "syn", "q", "wi fi", "sow", "true") assertJQ(req("df", "syn", "q", "wi fi", "sow", "true")
, "/response/numFound==0" , "/response/numFound==0"
); );
assertJQ(req("df", "syn", "q", "wi fi") // default sow=true assertJQ(req("df", "syn", "q", "wi fi") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
assertJQ(req("df", "syn", "q", "{!lucene sow=false}wi fi") assertJQ(req("df", "syn", "q", "{!lucene sow=false}wi fi")
@ -605,8 +619,9 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
, "/response/numFound==0" , "/response/numFound==0"
); );
assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=true assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
} }
@ -654,20 +669,25 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
, "/response/numFound==0" , "/response/numFound==0"
); );
assertJQ(req("df", "syn", "q", "wi fi") // default sow=true assertJQ(req("df", "syn", "q", "wi fi") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
assertJQ(req("df", "syn", "q", "wi /* foo */ fi") // default sow=true assertJQ(req("df", "syn", "q", "wi /* foo */ fi") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
assertJQ(req("df", "syn", "q", "wi /* foo */ /* bar */ fi") // default sow=true assertJQ(req("df", "syn", "q", "wi /* foo */ /* bar */ fi") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
assertJQ(req("df", "syn", "q", "/* foo */ wi fi /* bar */") // default sow=true assertJQ(req("df", "syn", "q", " /* foo */ wi fi /* bar */") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
assertJQ(req("df", "syn", "q", "/* foo */ wi /* bar */ fi /* baz */") // default sow=true assertJQ(req("df", "syn", "q", " /* foo */ wi /* bar */ fi /* baz */") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
@ -708,20 +728,25 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
, "/response/numFound==0" , "/response/numFound==0"
); );
assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=true assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ fi") // default sow=true assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ fi") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ /* bar */ fi") // default sow=true assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ /* bar */ fi") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi fi /* bar */") // default sow=true assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi fi /* bar */") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi /* bar */ fi /* baz */") // default sow=true assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi /* bar */ fi /* baz */") // default sow=false
, "/response/numFound==0" , "/response/numFound==1"
, "/response/docs/[0]/id=='20'"
); );
} }
@ -977,18 +1002,18 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
// //
try (SolrQueryRequest req = req()) { try (SolrQueryRequest req = req()) {
QParser qParser = QParser.getParser("text:grackle", req); // "text" has autoGeneratePhraseQueries="true" for (SolrParams params : Arrays.asList(noSowParams, sowFalseParams)) {
qParser.setParams(sowFalseParams); QParser qParser = QParser.getParser("text:grackle", req); // "text" has autoGeneratePhraseQueries="true"
Query q = qParser.getQuery(); qParser.setParams(sowFalseParams);
assertEquals("text:\"crow blackbird\" text:grackl", q.toString()); Query q = qParser.getQuery();
assertEquals("text:\"crow blackbird\" text:grackl", q.toString());
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams)) {
qParser = QParser.getParser("text:grackle", req);
qParser.setParams(params);
q = qParser.getQuery();
assertEquals("spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
} }
QParser qParser = QParser.getParser("text:grackle", req);
qParser.setParams(sowTrueParams);
Query q = qParser.getQuery();
assertEquals("spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams, sowFalseParams)) { for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams, sowFalseParams)) {
qParser = QParser.getParser("text_sw:grackle", req); // "text_sw" doesn't specify autoGeneratePhraseQueries => default false qParser = QParser.getParser("text_sw:grackle", req); // "text_sw" doesn't specify autoGeneratePhraseQueries => default false
qParser.setParams(params); qParser.setParams(params);