SOLR-6613: TextField.analyzeMultiTerm does not throw an exception when Analyzer returns no terms. (Bruno Roustant)

This commit is contained in:
Bruno Roustant 2020-01-10 16:52:49 +01:00
parent 1cb085afcb
commit 72dea4919e
No known key found for this signature in database
GPG Key ID: CD28DABB95360525
5 changed files with 95 additions and 21 deletions

View File

@ -371,6 +371,8 @@ Bug Fixes
affects splits triggered by the autoscale framework, which use async mode. affects splits triggered by the autoscale framework, which use async mode.
(Megan Carey, Andy Vuong, Bilal Waheed, Ilan Ginzburg, yonik) (Megan Carey, Andy Vuong, Bilal Waheed, Ilan Ginzburg, yonik)
* SOLR-6613: TextField.analyzeMultiTerm does not throw an exception when Analyzer returns no terms. (Bruno Roustant)
Other Changes Other Changes
--------------------- ---------------------

View File

@ -44,6 +44,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.RegexpQuery; import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.QueryBuilder; import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automata;
@ -995,8 +996,8 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
SchemaField sf = schema.getFieldOrNull((field)); SchemaField sf = schema.getFieldOrNull((field));
if (sf == null || ! (fieldType instanceof TextField)) return part; if (sf == null || ! (fieldType instanceof TextField)) return part;
String out = TextField.analyzeMultiTerm(field, part, ((TextField)fieldType).getMultiTermAnalyzer()).utf8ToString(); BytesRef out = TextField.analyzeMultiTerm(field, part, ((TextField)fieldType).getMultiTermAnalyzer());
return out; return out == null ? part : out.utf8ToString();
} }

View File

@ -165,6 +165,16 @@ public class TextField extends FieldType {
return new SolrRangeQuery(field.getName(), lower, upper, minInclusive, maxInclusive); return new SolrRangeQuery(field.getName(), lower, upper, minInclusive, maxInclusive);
} }
/**
* Analyzes a text part using the provided {@link Analyzer} for a multi-term query.
* <p>
* Expects a single token to be used as multi-term term. This single token might also be filtered out
* so zero token is supported and null is returned in this case.
*
* @return The multi-term term bytes; or null if there is no multi-term terms.
* @throws SolrException If the {@link Analyzer} tokenizes more than one token;
* or if an underlying {@link IOException} occurs.
*/
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) { public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
if (part == null || analyzerIn == null) return null; if (part == null || analyzerIn == null) return null;
@ -173,8 +183,10 @@ public class TextField extends FieldType {
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
if (!source.incrementToken()) if (!source.incrementToken()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned no terms for multiTerm term: " + part); // Accept no tokens because it may have been filtered out by a StopFilter for example.
return null;
}
BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef()); BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
if (source.incrementToken()) if (source.incrementToken())
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part); throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part);

View File

@ -24,6 +24,7 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SimpleParams; import org.apache.solr.common.params.SimpleParams;
import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.SolrParams;
@ -186,25 +187,29 @@ public class SimpleQParserPlugin extends QParserPlugin {
for (Map.Entry<String, Float> entry : weights.entrySet()) { for (Map.Entry<String, Float> entry : weights.entrySet()) {
String field = entry.getKey(); String field = entry.getKey();
FieldType type = schema.getFieldType(field); FieldType type = schema.getFieldType(field);
Query prefix; Query prefix = null;
if (type instanceof TextField) { if (type instanceof TextField) {
// If the field type is a TextField then use the multi term analyzer. // If the field type is a TextField then use the multi term analyzer.
Analyzer analyzer = ((TextField)type).getMultiTermAnalyzer(); Analyzer analyzer = ((TextField)type).getMultiTermAnalyzer();
String term = TextField.analyzeMultiTerm(field, text, analyzer).utf8ToString(); BytesRef termBytes = TextField.analyzeMultiTerm(field, text, analyzer);
SchemaField sf = schema.getField(field); if (termBytes != null) {
prefix = sf.getType().getPrefixQuery(qParser, sf, term); String term = termBytes.utf8ToString();
SchemaField sf = schema.getField(field);
prefix = sf.getType().getPrefixQuery(qParser, sf, term);
}
} else { } else {
// If the type is *not* a TextField don't do any analysis. // If the type is *not* a TextField don't do any analysis.
SchemaField sf = schema.getField(field); SchemaField sf = schema.getField(field);
prefix = type.getPrefixQuery(qParser, sf, text); prefix = type.getPrefixQuery(qParser, sf, text);
} }
if (prefix != null) {
float boost = entry.getValue(); float boost = entry.getValue();
if (boost != 1f) { if (boost != 1f) {
prefix = new BoostQuery(prefix, boost); prefix = new BoostQuery(prefix, boost);
}
bq.add(prefix, BooleanClause.Occur.SHOULD);
} }
bq.add(prefix, BooleanClause.Occur.SHOULD);
} }
return simplify(bq.build()); return simplify(bq.build());
@ -217,23 +222,27 @@ public class SimpleQParserPlugin extends QParserPlugin {
for (Map.Entry<String, Float> entry : weights.entrySet()) { for (Map.Entry<String, Float> entry : weights.entrySet()) {
String field = entry.getKey(); String field = entry.getKey();
FieldType type = schema.getFieldType(field); FieldType type = schema.getFieldType(field);
Query fuzzy; Query fuzzy = null;
if (type instanceof TextField) { if (type instanceof TextField) {
// If the field type is a TextField then use the multi term analyzer. // If the field type is a TextField then use the multi term analyzer.
Analyzer analyzer = ((TextField)type).getMultiTermAnalyzer(); Analyzer analyzer = ((TextField)type).getMultiTermAnalyzer();
String term = TextField.analyzeMultiTerm(field, text, analyzer).utf8ToString(); BytesRef termBytes = TextField.analyzeMultiTerm(field, text, analyzer);
fuzzy = new FuzzyQuery(new Term(entry.getKey(), term), fuzziness); if (termBytes != null) {
String term = termBytes.utf8ToString();
fuzzy = new FuzzyQuery(new Term(entry.getKey(), term), fuzziness);
}
} else { } else {
// If the type is *not* a TextField don't do any analysis. // If the type is *not* a TextField don't do any analysis.
fuzzy = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness); fuzzy = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness);
} }
if (fuzzy != null) {
float boost = entry.getValue(); float boost = entry.getValue();
if (boost != 1f) { if (boost != 1f) {
fuzzy = new BoostQuery(fuzzy, boost); fuzzy = new BoostQuery(fuzzy, boost);
}
bq.add(fuzzy, BooleanClause.Occur.SHOULD);
} }
bq.add(fuzzy, BooleanClause.Occur.SHOULD);
} }
return simplify(bq.build()); return simplify(bq.build());

View File

@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.schema;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrException;
import org.junit.Test;
/**
* Tests directly {@link org.apache.solr.schema.TextField} methods.
*/
public class TestTextField extends SolrTestCaseJ4 {
@Test
public void testAnalyzeMultiTerm() {
// No terms provided by the StopFilter (stop word) for the multi-term part.
// This is supported. Check TextField.analyzeMultiTerm returns null (and does not throw an exception).
BytesRef termBytes = TextField.analyzeMultiTerm("field", "the", new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET));
assertNull(termBytes);
// One term provided by the WhitespaceTokenizer for the multi-term part.
// This is the regular case. Check TextField.analyzeMultiTerm returns it (and does not throw an exception).
termBytes = TextField.analyzeMultiTerm("field", "Sol", new WhitespaceAnalyzer());
assertEquals("Sol", termBytes.utf8ToString());
// Two terms provided by the WhitespaceTokenizer for the multi-term part.
// This is not allowed. Expect an exception.
SolrException exception = expectThrows(SolrException.class, () -> TextField.analyzeMultiTerm("field", "term1 term2", new WhitespaceAnalyzer()));
assertEquals("Unexpected error code", SolrException.ErrorCode.BAD_REQUEST.code, exception.code());
}
}