SOLR-6613: TextField.analyzeMultiTerm does not throw an exception when Analyzer returns no terms. (Bruno Roustant)

Closes #1146
This commit is contained in:
Bruno Roustant 2020-01-06 14:54:18 +01:00
parent d68f3e1a44
commit 0b072ecedb
No known key found for this signature in database
GPG Key ID: CD28DABB95360525
5 changed files with 95 additions and 21 deletions

View File

@ -212,6 +212,8 @@ Bug Fixes
* SOLR-14163: SOLR_SSL_CLIENT_HOSTNAME_VERIFICATION needs to work with Jetty server/client SSL contexts (Kevin Risden) * SOLR-14163: SOLR_SSL_CLIENT_HOSTNAME_VERIFICATION needs to work with Jetty server/client SSL contexts (Kevin Risden)
* SOLR-6613: TextField.analyzeMultiTerm does not throw an exception when Analyzer returns no terms. (Bruno Roustant)
Other Changes Other Changes
--------------------- ---------------------

View File

@ -45,6 +45,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.RegexpQuery; import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.QueryBuilder; import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Automaton;
@ -997,8 +998,8 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
SchemaField sf = schema.getFieldOrNull((field)); SchemaField sf = schema.getFieldOrNull((field));
if (sf == null || ! (fieldType instanceof TextField)) return part; if (sf == null || ! (fieldType instanceof TextField)) return part;
String out = TextField.analyzeMultiTerm(field, part, ((TextField)fieldType).getMultiTermAnalyzer()).utf8ToString(); BytesRef out = TextField.analyzeMultiTerm(field, part, ((TextField)fieldType).getMultiTermAnalyzer());
return out; return out == null ? part : out.utf8ToString();
} }

View File

@ -165,6 +165,16 @@ public class TextField extends FieldType {
return new SolrRangeQuery(field.getName(), lower, upper, minInclusive, maxInclusive); return new SolrRangeQuery(field.getName(), lower, upper, minInclusive, maxInclusive);
} }
/**
* Analyzes a text part using the provided {@link Analyzer} for a multi-term query.
* <p>
* Expects a single token to be used as the multi-term term. This single token might also be filtered out,
* so zero tokens are supported and null is returned in this case.
*
* @return The multi-term term bytes; or null if there is no multi-term term.
* @throws SolrException If the {@link Analyzer} tokenizes more than one token;
* or if an underlying {@link IOException} occurs.
*/
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) { public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
if (part == null || analyzerIn == null) return null; if (part == null || analyzerIn == null) return null;
@ -173,8 +183,10 @@ public class TextField extends FieldType {
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
if (!source.incrementToken()) if (!source.incrementToken()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned no terms for multiTerm term: " + part); // Accept no tokens because it may have been filtered out by a StopFilter for example.
return null;
}
BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef()); BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
if (source.incrementToken()) if (source.incrementToken())
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part); throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part);

View File

@ -24,6 +24,7 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SimpleParams; import org.apache.solr.common.params.SimpleParams;
import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.SolrParams;
@ -186,26 +187,30 @@ public class SimpleQParserPlugin extends QParserPlugin {
for (Map.Entry<String, Float> entry : weights.entrySet()) { for (Map.Entry<String, Float> entry : weights.entrySet()) {
String field = entry.getKey(); String field = entry.getKey();
FieldType type = schema.getFieldType(field); FieldType type = schema.getFieldType(field);
Query prefix; Query prefix = null;
if (type instanceof TextField) { if (type instanceof TextField) {
// If the field type is a TextField then use the multi term analyzer. // If the field type is a TextField then use the multi term analyzer.
Analyzer analyzer = ((TextField)type).getMultiTermAnalyzer(); Analyzer analyzer = ((TextField)type).getMultiTermAnalyzer();
String term = TextField.analyzeMultiTerm(field, text, analyzer).utf8ToString(); BytesRef termBytes = TextField.analyzeMultiTerm(field, text, analyzer);
if (termBytes != null) {
String term = termBytes.utf8ToString();
SchemaField sf = schema.getField(field); SchemaField sf = schema.getField(field);
prefix = sf.getType().getPrefixQuery(qParser, sf, term); prefix = sf.getType().getPrefixQuery(qParser, sf, term);
}
} else { } else {
// If the type is *not* a TextField don't do any analysis. // If the type is *not* a TextField don't do any analysis.
SchemaField sf = schema.getField(field); SchemaField sf = schema.getField(field);
prefix = type.getPrefixQuery(qParser, sf, text); prefix = type.getPrefixQuery(qParser, sf, text);
} }
if (prefix != null) {
float boost = entry.getValue(); float boost = entry.getValue();
if (boost != 1f) { if (boost != 1f) {
prefix = new BoostQuery(prefix, boost); prefix = new BoostQuery(prefix, boost);
} }
bq.add(prefix, BooleanClause.Occur.SHOULD); bq.add(prefix, BooleanClause.Occur.SHOULD);
} }
}
return simplify(bq.build()); return simplify(bq.build());
} }
@ -217,24 +222,28 @@ public class SimpleQParserPlugin extends QParserPlugin {
for (Map.Entry<String, Float> entry : weights.entrySet()) { for (Map.Entry<String, Float> entry : weights.entrySet()) {
String field = entry.getKey(); String field = entry.getKey();
FieldType type = schema.getFieldType(field); FieldType type = schema.getFieldType(field);
Query fuzzy; Query fuzzy = null;
if (type instanceof TextField) { if (type instanceof TextField) {
// If the field type is a TextField then use the multi term analyzer. // If the field type is a TextField then use the multi term analyzer.
Analyzer analyzer = ((TextField)type).getMultiTermAnalyzer(); Analyzer analyzer = ((TextField)type).getMultiTermAnalyzer();
String term = TextField.analyzeMultiTerm(field, text, analyzer).utf8ToString(); BytesRef termBytes = TextField.analyzeMultiTerm(field, text, analyzer);
if (termBytes != null) {
String term = termBytes.utf8ToString();
fuzzy = new FuzzyQuery(new Term(entry.getKey(), term), fuzziness); fuzzy = new FuzzyQuery(new Term(entry.getKey(), term), fuzziness);
}
} else { } else {
// If the type is *not* a TextField don't do any analysis. // If the type is *not* a TextField don't do any analysis.
fuzzy = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness); fuzzy = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness);
} }
if (fuzzy != null) {
float boost = entry.getValue(); float boost = entry.getValue();
if (boost != 1f) { if (boost != 1f) {
fuzzy = new BoostQuery(fuzzy, boost); fuzzy = new BoostQuery(fuzzy, boost);
} }
bq.add(fuzzy, BooleanClause.Occur.SHOULD); bq.add(fuzzy, BooleanClause.Occur.SHOULD);
} }
}
return simplify(bq.build()); return simplify(bq.build());
} }

View File

@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.schema;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrException;
import org.junit.Test;
/**
 * Unit tests that call {@link org.apache.solr.schema.TextField} methods directly.
 */
public class TestTextField extends SolrTestCaseJ4 {

  /**
   * Checks the three outcomes of {@link TextField#analyzeMultiTerm}: no token (null is returned),
   * exactly one token (its bytes are returned) and more than one token (a BAD_REQUEST
   * {@link SolrException} is thrown).
   */
  @Test
  public void testAnalyzeMultiTerm() {
    // Analyzers are Closeable, so they are opened in a try-with-resources block to make sure
    // they are released even when an assertion fails.
    try (StopAnalyzer stopAnalyzer = new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
         WhitespaceAnalyzer singleTermAnalyzer = new WhitespaceAnalyzer();
         WhitespaceAnalyzer twoTermsAnalyzer = new WhitespaceAnalyzer()) {

      // No terms provided by the StopFilter (stop word) for the multi-term part.
      // This is supported. Check TextField.analyzeMultiTerm returns null (and does not throw an exception).
      BytesRef termBytes = TextField.analyzeMultiTerm("field", "the", stopAnalyzer);
      assertNull(termBytes);

      // One term provided by the WhitespaceTokenizer for the multi-term part.
      // This is the regular case. Check TextField.analyzeMultiTerm returns it (and does not throw an exception).
      termBytes = TextField.analyzeMultiTerm("field", "Sol", singleTermAnalyzer);
      // Fail with a clear assertion instead of a NullPointerException on the next line.
      assertNotNull("Expected a single term", termBytes);
      assertEquals("Sol", termBytes.utf8ToString());

      // Two terms provided by the WhitespaceTokenizer for the multi-term part.
      // This is not allowed. Expect an exception.
      SolrException exception = expectThrows(SolrException.class,
          () -> TextField.analyzeMultiTerm("field", "term1 term2", twoTermsAnalyzer));
      assertEquals("Unexpected error code", SolrException.ErrorCode.BAD_REQUEST.code, exception.code());
    }
  }
}