mirror of https://github.com/apache/lucene.git
SOLR-10310: By default, stop splitting on whitespace prior to analysis in edismax and standard/"lucene" query parsers
This commit is contained in:
parent
4f89f98f66
commit
dd171ff8fe
|
@ -65,6 +65,11 @@ Upgrading from Solr 6.x
|
|||
configuration with a boolean init arg "enabled" set to "false". For a more fine-grained control users
|
||||
should explicitly specify at least one SolrJmxReporter configuration.
|
||||
|
||||
* The sow (split-on-whitespace) request param now defaults to false (true in previous versions).
|
||||
This affects the edismax and standard/"lucene" query parsers: if the sow param is not specified,
|
||||
query text will not be split on whitespace before analysis. See
|
||||
https://lucidworks.com/2017/04/18/multi-word-synonyms-solr-adds-query-time-support/ .
|
||||
|
||||
New Features
|
||||
----------------------
|
||||
* SOLR-9857, SOLR-9858: Collect aggregated metrics from nodes and shard leaders in overseer. (ab)
|
||||
|
@ -119,7 +124,8 @@ Other Changes
|
|||
|
||||
* SOLR-10557: Make "compact" format default for /admin/metrics. (ab)
|
||||
|
||||
----------------------
|
||||
* SOLR-10310: By default, stop splitting on whitespace prior to analysis
|
||||
in edismax and standard/"lucene" query parsers. (Steve Rowe)
|
||||
|
||||
================== 6.6.0 ==================
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ public class QueryParser extends SolrQueryParserBase implements QueryParserConst
|
|||
static public enum Operator { OR, AND }
|
||||
|
||||
/** default split on whitespace behavior */
|
||||
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = true;
|
||||
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = false;
|
||||
|
||||
public QueryParser(String defaultField, QParser parser) {
|
||||
this(new FastCharStream(new StringReader("")));
|
||||
|
|
|
@ -45,7 +45,7 @@ public class QueryParser extends SolrQueryParserBase {
|
|||
static public enum Operator { OR, AND }
|
||||
|
||||
/** default split on whitespace behavior */
|
||||
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = true;
|
||||
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = false;
|
||||
|
||||
public QueryParser(String defaultField, QParser parser) {
|
||||
this(new FastCharStream(new StringReader("")));
|
||||
|
|
|
@ -98,6 +98,7 @@ public class DisMaxQParser extends QParser {
|
|||
|
||||
@Override
|
||||
public Query parse() throws SyntaxError {
|
||||
|
||||
parsed = true;
|
||||
SolrParams solrParams = SolrParams.wrapDefaults(localParams, params);
|
||||
|
||||
|
@ -265,6 +266,7 @@ public class DisMaxQParser extends QParser {
|
|||
IMPOSSIBLE_FIELD_NAME);
|
||||
parser.addAlias(IMPOSSIBLE_FIELD_NAME, tiebreaker, fields);
|
||||
parser.setPhraseSlop(slop);
|
||||
parser.setSplitOnWhitespace(true);
|
||||
return parser;
|
||||
}
|
||||
|
||||
|
|
|
@ -23,6 +23,8 @@ import java.util.Random;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.BoostQuery;
|
||||
|
@ -364,8 +366,16 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
|
|||
assertQ(req("defType","edismax", "mm","0", "q","Terminator: 100", "qf","movies_t foo_i"),
|
||||
twor);
|
||||
|
||||
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i"),
|
||||
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i", "sow","true"),
|
||||
nor);
|
||||
// When sow=false, the per-field query structures differ (no "Terminator" query on integer field foo_i),
|
||||
// so a dismax-per-field is constructed. As a result, mm=100% is applied per-field instead of per-term;
|
||||
// since there is only one term (100) required in the foo_i field's dismax, the query can match docs that
|
||||
// only have the 100 term in the foo_i field, and don't necessarily have "Terminator" in any field.
|
||||
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i", "sow","false"),
|
||||
oner);
|
||||
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 100", "qf","movies_t foo_i"), // default sow=false
|
||||
oner);
|
||||
|
||||
assertQ(req("defType","edismax", "mm","100%", "q","Terminator: 8", "qf","movies_t foo_i"),
|
||||
oner);
|
||||
|
@ -1413,19 +1423,21 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
|
|||
assertJQ(req("qf", "text_sw title", "defType","edismax", "q","wi fi", "sow","true")
|
||||
, "/response/numFound==0"
|
||||
);
|
||||
assertJQ(req("qf","text_sw title", "defType","edismax", "q","wi fi") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("qf","text_sw title", "defType","edismax", "q","wi fi") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='72'"
|
||||
);
|
||||
|
||||
assertJQ(req("qf","text_sw title", "q","{!edismax sow=false}wi fi")
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='72'"
|
||||
);
|
||||
assertJQ(req("df", "text_sw title", "q","{!edismax sow=true}wi fi")
|
||||
assertJQ(req("qf", "text_sw title", "q","{!edismax sow=true}wi fi")
|
||||
, "/response/numFound==0"
|
||||
);
|
||||
assertJQ(req("df", "text_sw title", "q", "{!edismax}wi fi") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("qf", "text_sw title", "q", "{!edismax}wi fi") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='72'"
|
||||
);
|
||||
|
||||
assertQ(req("qf", "name title",
|
||||
|
@ -1451,7 +1463,7 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
|
|||
assertQ(req("qf", "name title",
|
||||
"q", "barking curds of stigma",
|
||||
"defType", "edismax",
|
||||
"debugQuery", "true"), // Default sow=true
|
||||
"debugQuery", "true"), // Default sow=false
|
||||
"//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:barking | title:barking))')]",
|
||||
"//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:curds | title:curds))')]",
|
||||
"//str[@name='parsedquery'][contains(.,'DisjunctionMaxQuery((name:of | title:of))')]",
|
||||
|
@ -1768,18 +1780,18 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
|
|||
//
|
||||
// crow blackbird, grackle
|
||||
|
||||
try (SolrQueryRequest req = req(sowFalseParams)) {
|
||||
for (SolrParams params : Arrays.asList(noSowParams, sowFalseParams)) {
|
||||
try (SolrQueryRequest req = req(params)) {
|
||||
QParser qParser = QParser.getParser("text:grackle", "edismax", req); // "text" has autoGeneratePhraseQueries="true"
|
||||
Query q = qParser.getQuery();
|
||||
assertEquals("+(text:\"crow blackbird\" text:grackl)", q.toString());
|
||||
}
|
||||
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams)) {
|
||||
try (SolrQueryRequest req = req(params)) {
|
||||
}
|
||||
try (SolrQueryRequest req = req(sowTrueParams)) {
|
||||
QParser qParser = QParser.getParser("text:grackle", "edismax", req);
|
||||
Query q = qParser.getQuery();
|
||||
assertEquals("+spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
|
||||
}
|
||||
}
|
||||
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams, sowFalseParams)) {
|
||||
try (SolrQueryRequest req = req(params)) {
|
||||
QParser qParser = QParser.getParser("text_sw:grackle", "edismax", req); // "text_sw" doesn't specify autoGeneratePhraseQueries => default false
|
||||
|
@ -1790,7 +1802,8 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
|
|||
|
||||
Stream.of(noSowParams, sowTrueParams, sowFalseParams).forEach(p->p.add("qf", "text text_sw"));
|
||||
|
||||
try (SolrQueryRequest req = req(sowFalseParams)) {
|
||||
for (SolrParams params : Arrays.asList(noSowParams, sowFalseParams)) {
|
||||
try (SolrQueryRequest req = req(params)) {
|
||||
QParser qParser = QParser.getParser("grackle", "edismax", req);
|
||||
Query q = qParser.getQuery();
|
||||
assertEquals("+((text:\"crow blackbird\" text:grackl)"
|
||||
|
@ -1803,9 +1816,9 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
|
|||
+ " | (((+text_sw:crow +text_sw:blackbird) text_sw:grackl) text_sw:wifi))",
|
||||
q.toString());
|
||||
}
|
||||
}
|
||||
|
||||
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams)) {
|
||||
try (SolrQueryRequest req = req(params)) {
|
||||
try (SolrQueryRequest req = req(sowTrueParams)) {
|
||||
QParser qParser = QParser.getParser("grackle", "edismax", req);
|
||||
Query q = qParser.getQuery();
|
||||
assertEquals("+(spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])"
|
||||
|
@ -1819,7 +1832,6 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
|
|||
q.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testSowFalseWithBoost() throws Exception {
|
||||
try (SolrQueryRequest req = req("sow", "false", "qf", "subject title")) {
|
||||
|
@ -1939,6 +1951,8 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
|
|||
|
||||
static class FuzzyDismaxQParser extends ExtendedDismaxQParser {
|
||||
|
||||
private static final float MIN_SIMILARITY = 0.75F;
|
||||
|
||||
public FuzzyDismaxQParser(String qstr, SolrParams localParams,
|
||||
SolrParams params, SolrQueryRequest req) {
|
||||
super(qstr, localParams, params, req);
|
||||
|
@ -1958,16 +1972,50 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
|
|||
super(parser, defaultField);
|
||||
frequentlyMisspelledWords = new HashSet<>();
|
||||
frequentlyMisspelledWords.add("absence");
|
||||
frequentlyMisspelledWords.add("absenc");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Query getFieldQuery(String field,
|
||||
String val, boolean quoted, boolean raw) throws SyntaxError {
|
||||
if(frequentlyMisspelledWords.contains(val)) {
|
||||
return getFuzzyQuery(field, val, 0.75F);
|
||||
return getFuzzyQuery(field, val, MIN_SIMILARITY);
|
||||
}
|
||||
return super.getFieldQuery(field, val, quoted, raw);
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle multi-term queries by repacking boolean queries with frequently misspelled term
|
||||
* queries rewritten as fuzzy queries.
|
||||
**/
|
||||
@Override
|
||||
protected Query newFieldQuery(Analyzer analyzer, String field, String queryText,
|
||||
boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries)
|
||||
throws SyntaxError {
|
||||
Query q = super.newFieldQuery
|
||||
(analyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries);
|
||||
if (q instanceof BooleanQuery) {
|
||||
boolean rewrittenSubQ = false; // dirty flag: rebuild the repacked query?
|
||||
BooleanQuery.Builder builder = newBooleanQuery();
|
||||
for (BooleanClause clause : ((BooleanQuery)q).clauses()) {
|
||||
Query subQ = clause.getQuery();
|
||||
if (subQ instanceof TermQuery) {
|
||||
Term subTerm = ((TermQuery)subQ).getTerm();
|
||||
if (frequentlyMisspelledWords.contains(subTerm.text())) {
|
||||
rewrittenSubQ = true;
|
||||
Query fuzzySubQ = newFuzzyQuery(subTerm, MIN_SIMILARITY, getFuzzyPrefixLength());
|
||||
clause = newBooleanClause(fuzzySubQ, clause.getOccur());
|
||||
}
|
||||
}
|
||||
builder.add(clause);
|
||||
}
|
||||
if (rewrittenSubQ) {
|
||||
builder.setMinimumNumberShouldMatch(((BooleanQuery)q).getMinimumNumberShouldMatch());
|
||||
q = builder.build();
|
||||
}
|
||||
}
|
||||
return q;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -75,10 +75,23 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
|
|||
|
||||
@Test
|
||||
public void testPhrase() {
|
||||
// "text" field's type has WordDelimiterGraphFilter (WDGFF) and autoGeneratePhraseQueries=true
|
||||
// should generate a phrase of "now cow" and match only one doc
|
||||
assertQ(req("q", "text:now-cow", "indent", "true")
|
||||
assertQ(req("q", "text:now-cow", "indent", "true", "sow","true")
|
||||
, "//*[@numFound='1']"
|
||||
);
|
||||
// When sow=false, autoGeneratePhraseQueries=true only works when a graph is produced
|
||||
// (i.e. overlapping terms, e.g. if WDGFF's preserveOriginal=1 or concatenateWords=1).
|
||||
// The WDGFF config on the "text" field doesn't produce a graph, so the generated query
|
||||
// is not a phrase query.  As a result, docs that don't match the phrase query "now cow" can still match.
|
||||
assertQ(req("q", "text:now-cow", "indent", "true", "sow","false")
|
||||
, "//*[@numFound='2']"
|
||||
);
|
||||
assertQ(req("q", "text:now-cow", "indent", "true") // default sow=false
|
||||
, "//*[@numFound='2']"
|
||||
);
|
||||
|
||||
// "text_np" field's type has WDGFF and (default) autoGeneratePhraseQueries=false
|
||||
// should generate a query of (now OR cow) and match both docs
|
||||
assertQ(req("q", "text_np:now-cow", "indent", "true")
|
||||
, "//*[@numFound='2']"
|
||||
|
@ -593,8 +606,9 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
|
|||
assertJQ(req("df", "syn", "q", "wi fi", "sow", "true")
|
||||
, "/response/numFound==0"
|
||||
);
|
||||
assertJQ(req("df", "syn", "q", "wi fi") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", "wi fi") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
|
||||
assertJQ(req("df", "syn", "q", "{!lucene sow=false}wi fi")
|
||||
|
@ -605,8 +619,9 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
|
|||
, "/response/numFound==0"
|
||||
);
|
||||
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -654,20 +669,25 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
|
|||
, "/response/numFound==0"
|
||||
);
|
||||
|
||||
assertJQ(req("df", "syn", "q", "wi fi") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", "wi fi") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
assertJQ(req("df", "syn", "q", "wi /* foo */ fi") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", "wi /* foo */ fi") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
assertJQ(req("df", "syn", "q", "wi /* foo */ /* bar */ fi") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", "wi /* foo */ /* bar */ fi") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
assertJQ(req("df", "syn", "q", "/* foo */ wi fi /* bar */") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", " /* foo */ wi fi /* bar */") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
assertJQ(req("df", "syn", "q", "/* foo */ wi /* bar */ fi /* baz */") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", " /* foo */ wi /* bar */ fi /* baz */") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
|
||||
|
||||
|
@ -708,20 +728,25 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
|
|||
, "/response/numFound==0"
|
||||
);
|
||||
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}wi fi") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ fi") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ fi") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ /* bar */ fi") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}wi /* foo */ /* bar */ fi") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi fi /* bar */") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi fi /* bar */") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi /* bar */ fi /* baz */") // default sow=true
|
||||
, "/response/numFound==0"
|
||||
assertJQ(req("df", "syn", "q", "{!lucene}/* foo */ wi /* bar */ fi /* baz */") // default sow=false
|
||||
, "/response/numFound==1"
|
||||
, "/response/docs/[0]/id=='20'"
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -977,18 +1002,18 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
|
|||
//
|
||||
try (SolrQueryRequest req = req()) {
|
||||
|
||||
for (SolrParams params : Arrays.asList(noSowParams, sowFalseParams)) {
|
||||
QParser qParser = QParser.getParser("text:grackle", req); // "text" has autoGeneratePhraseQueries="true"
|
||||
qParser.setParams(sowFalseParams);
|
||||
Query q = qParser.getQuery();
|
||||
assertEquals("text:\"crow blackbird\" text:grackl", q.toString());
|
||||
|
||||
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams)) {
|
||||
qParser = QParser.getParser("text:grackle", req);
|
||||
qParser.setParams(params);
|
||||
q = qParser.getQuery();
|
||||
assertEquals("spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
|
||||
}
|
||||
|
||||
QParser qParser = QParser.getParser("text:grackle", req);
|
||||
qParser.setParams(sowTrueParams);
|
||||
Query q = qParser.getQuery();
|
||||
assertEquals("spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
|
||||
|
||||
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams, sowFalseParams)) {
|
||||
qParser = QParser.getParser("text_sw:grackle", req); // "text_sw" doesn't specify autoGeneratePhraseQueries => default false
|
||||
qParser.setParams(params);
|
||||
|
|
Loading…
Reference in New Issue