SOLR-8329: SchemaSimilarityFactory now supports a 'defaultSimFromFieldType' init option for using a fieldType name to identify which Similarity to use as a default

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1715881 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Chris M. Hostetter 2015-11-23 16:50:49 +00:00
parent 8090380ecd
commit 5b0b4194a1
7 changed files with 329 additions and 28 deletions

View File

@ -242,6 +242,9 @@ New Features
* SOLR-6168: Add a 'sort' local param to the collapse QParser to support using complex sort options
to select the representitive doc for each collapsed group. (Umesh Prasad, hossman)
* SOLR-8329: SchemaSimilarityFactory now supports a 'defaultSimFromFieldType' init option for using
a fieldType name to identify which Similarity to use as a default. (hossman)
Bug Fixes
----------------------

View File

@ -23,6 +23,8 @@ import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.Version;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
@ -34,43 +36,107 @@ import org.slf4j.LoggerFactory;
/**
* <p>
* SimilarityFactory that returns a {@link PerFieldSimilarityWrapper}
* that delegates to the field type, if it's configured, otherwise
* returns a sensible default depending on the {@link Version} matching configured.
* <code>SimilarityFactory</code> that returns a global {@link PerFieldSimilarityWrapper}
* that delegates to the field type, if it's configured. For field type's that
* do not have a <code>Similarity</code> explicitly configured, the global <code>Similarity</code>
* will use per fieldtype defaults -- either based on an explicitly configured
* <code>defaultSimFromFieldType</code> a sensible default depending on the {@link Version}
* matching configured:
* </p>
* <ul>
* <li><code>luceneMatchVersion &lt; 6.0</code> = {@link ClassicSimilarity}</li>
* <li><code>luceneMatchVersion &gt;= 6.0</code> = {@link BM25Similarity}</li>
* </ul>
* <p>
* <b>NOTE:</b> Users should be aware that in addition to supporting
* <code>Similarity</code> configurations specified on individual
* field types, this factory also differs in behavior from
* {@link ClassicSimilarityFactory} because of other differences in the
* implementations of <code>PerFieldSimilarityWrapper</code> and
* {@link ClassicSimilarity} - notably in methods such as
* {@link Similarity#coord} and {@link Similarity#queryNorm}.
* The <code>defaultSimFromFieldType</code> option accepts the name of any fieldtype, and uses
* whatever <code>Similarity</code> is explicitly configured for that fieldType as thedefault for
* all other field types. For example:
* </p>
* <pre class="prettyprint">
* &lt;similarity class="solr.SchemaSimilarityFactory" &gt;
* &lt;str name="defaultSimFromFieldType"&gt;type-using-custom-dfr&lt;/str&gt;
* &lt;/similarity&gt;
* ...
* &lt;fieldType name="type-using-custom-dfr" class="solr.TextField"&gt;
* ...
* &lt;similarity class="solr.DFRSimilarityFactory"&gt;
* &lt;str name="basicModel"&gt;I(F)&lt;/str&gt;
* &lt;str name="afterEffect"&gt;B&lt;/str&gt;
* &lt;str name="normalization"&gt;H3&lt;/str&gt;
* &lt;float name="mu"&gt;900&lt;/float&gt;
* &lt;/similarity&gt;
* &lt;/fieldType&gt;
* </pre>
* <p>
* In the example above, any fieldtypes that do not define their own <code>&lt;/similarity/&gt;</code>
* will use the <code>Similarity</code> configured for the <code>type-using-custom-dfr</code>.
* </p>
*
* <p>
* <b>NOTE:</b> Users should be aware that even when this factory uses a single default
* <code>Similarity</code> for some or all fields in a Query, the behavior can be inconsistent
* with the behavior of explicitly configuring that same <code>Similarity</code> globally, because
* of differences in how some multi-field / multi-clause behavior is defined in
* <code>PerFieldSimilarityWrapper</code>. In particular please consider carefully the documentation
* &amp; implementation of {@link Similarity#coord} and {@link Similarity#queryNorm} in
* {@link ClassicSimilarity} compared to {@link PerFieldSimilarityWrapper}
* </p>
*
* @see FieldType#getSimilarity
*/
public class SchemaSimilarityFactory extends SimilarityFactory implements SolrCoreAware {
private Similarity similarity; // set by init
private Similarity defaultSimilarity; // set by inform(SolrCore)
private volatile SolrCore core;
private static final String INIT_OPT = "defaultSimFromFieldType";
private String defaultSimFromFieldType; // set by init, if null use sensible implicit default
private volatile SolrCore core; // set by inform(SolrCore)
private volatile Similarity similarity; // lazy instantiated
@Override
public void inform(SolrCore core) {
this.core = core;
this.defaultSimilarity = this.core.getSolrConfig().luceneMatchVersion.onOrAfter(Version.LUCENE_6_0_0)
? new BM25Similarity()
: new ClassicSimilarity();
}
@Override
public void init(SolrParams args) {
defaultSimFromFieldType = args.get(INIT_OPT, null);
super.init(args);
}
@Override
public Similarity getSimilarity() {
if (null == core) {
throw new IllegalStateException("SchemaSimilarityFactory can not be used until SolrCoreAware.inform has been called");
}
if (null == similarity) {
// Need to instantiate lazily, can't do this in inform(SolrCore) because of chicken/egg
// circular initialization hell with core.getLatestSchema() to lookup defaultSimFromFieldType
Similarity defaultSim = null;
if (null == defaultSimFromFieldType) {
// nothing configured, choose a sensible implicit default...
defaultSim = this.core.getSolrConfig().luceneMatchVersion.onOrAfter(Version.LUCENE_6_0_0)
? new BM25Similarity()
: new ClassicSimilarity();
} else {
FieldType defSimFT = core.getLatestSchema().getFieldTypeByName(defaultSimFromFieldType);
if (null == defSimFT) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"SchemaSimilarityFactory configured with " + INIT_OPT + "='" +
defaultSimFromFieldType + "' but that <fieldType> does not exist");
}
defaultSim = defSimFT.getSimilarity();
if (null == defaultSim) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"SchemaSimilarityFactory configured with " + INIT_OPT + "='" +
defaultSimFromFieldType +
"' but that <fieldType> does not define a <similarity>");
}
}
assert null != defaultSim;
final Similarity defaultSimilarity = defaultSim;
similarity = new PerFieldSimilarityWrapper() {
@Override
public Similarity get(String name) {
@ -84,12 +150,6 @@ public class SchemaSimilarityFactory extends SimilarityFactory implements SolrCo
}
};
}
@Override
public Similarity getSimilarity() {
if (null == core) {
throw new IllegalStateException("SchemaSimilarityFactory can not be used until SolrCoreAware.inform has been called");
}
return similarity;
}
}

View File

@ -0,0 +1,43 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<schema name="bad-schema-sim-global-vs-ft-mismatch" version="1.0">
<similarity class="solr.SchemaSimilarityFactory" >
<!-- BROKEN: the named fieldType does not exist -->
<str name="defaultSimFromFieldType">ft-does-not-exist</str>
</similarity>
<fieldType name="ft-has-no-sim" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="ft-overrides-default-sim" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
<similarity class="org.apache.lucene.misc.SweetSpotSimilarity"/>
</fieldType>
<field name="sim1text" type="ft-overrides-default-sim" indexed="true" stored="true"/>
<defaultSearchField>sim1text</defaultSearchField>
</schema>

View File

@ -0,0 +1,43 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<schema name="bad-schema-sim-global-vs-ft-mismatch" version="1.0">
<similarity class="solr.SchemaSimilarityFactory" >
<!-- BROKEN: the named fieldType does not have an explicit sim defined, circular default -->
<str name="defaultSimFromFieldType">ft-has-no-sim</str>
</similarity>
<fieldType name="ft-has-no-sim" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="ft-overrides-default-sim" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
<similarity class="org.apache.lucene.misc.SweetSpotSimilarity"/>
</fieldType>
<field name="sim1text" type="ft-overrides-default-sim" indexed="true" stored="true"/>
<defaultSearchField>sim1text</defaultSearchField>
</schema>

View File

@ -0,0 +1,66 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Per-field similarity testing where default for fieldtypes w/o sim is overridden -->
<schema name="test" version="1.0">
<similarity class="solr.SchemaSimilarityFactory" >
<str name="defaultSimFromFieldType">sim-used-as-default-override</str>
</similarity>
<!-- some per-field similarity examples -->
<!-- specify a Similarity classname directly -->
<fieldType name="sim-explicit" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
<similarity class="org.apache.lucene.misc.SweetSpotSimilarity"/>
</fieldType>
<!-- specify a Similarity factory -->
<fieldType name="sim-used-as-default-override" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
<similarity class="solr.CustomSimilarityFactory">
<str name="echo">is there an echo?</str>
</similarity>
</fieldType>
<!-- don't specify any sim at all: get the default -->
<fieldType name="sim-none-get-default" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
</fieldType>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="sim1text" type="sim-explicit" indexed="true" stored="true"/>
<field name="sim2text" type="sim-used-as-default-override" indexed="true" stored="true"/>
<field name="sim3text" type="sim-none-get-default" indexed="true" stored="true"/>
<!-- make sure custom sims work with dynamic fields -->
<dynamicField name="*_sim1" type="sim-explicit" indexed="true" stored="true"/>
<dynamicField name="*_sim2" type="sim-used-as-default-override" indexed="true" stored="true"/>
<dynamicField name="*_sim3" type="sim-none-get-default" indexed="true" stored="true"/>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<uniqueKey>id</uniqueKey>
</schema>

View File

@ -119,4 +119,14 @@ public class BadIndexSchemaTest extends AbstractBadConfigTestBase {
doTest("bad-schema-bogus-analysis-parameters.xml", "Unknown parameters");
}
public void testSimDefaultFieldTypeHasNoExplicitSim() throws Exception {
doTest("bad-schema-sim-default-has-no-explicit-sim.xml",
"ft-has-no-sim");
}
public void testSimDefaultFieldTypeDoesNotExist() throws Exception {
doTest("bad-schema-sim-default-does-not-exist.xml",
"ft-does-not-exist");
}
}

View File

@ -0,0 +1,76 @@
package org.apache.solr.search.similarities;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.misc.SweetSpotSimilarity;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.junit.BeforeClass;
/**
* Tests per-field similarity support in the schema when SchemaSimilarityFactory is explicitly
* configured to use a custom default sim for field types that do not override it.
* @see TestPerFieldSimilarity
*/
public class TestPerFieldSimilarityWithDefaultOverride extends BaseSimilarityTestCase {
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-basic.xml","schema-sim-default-override.xml");
}
/** test a field where the sim is specified directly */
public void testDirect() throws Exception {
assertNotNull(getSimilarity("sim1text", SweetSpotSimilarity.class));
}
/** ... and for a dynamic field */
public void testDirectDynamic() throws Exception {
assertNotNull(getSimilarity("text_sim1", SweetSpotSimilarity.class));
}
/** test a field where a configurable sim factory is explicitly defined */
public void testDirectFactory() throws Exception {
MockConfigurableSimilarity sim = getSimilarity("sim2text", MockConfigurableSimilarity.class);
assertEquals("is there an echo?", sim.getPassthrough());
}
/** ... and for a dynamic field */
public void testDirectFactoryDynamic() throws Exception {
MockConfigurableSimilarity sim = getSimilarity("text_sim2", MockConfigurableSimilarity.class);
assertEquals("is there an echo?", sim.getPassthrough());
}
/** test a field where no similarity is specified */
public void testDefaults() throws Exception {
MockConfigurableSimilarity sim = getSimilarity("sim3text", MockConfigurableSimilarity.class);
assertEquals("is there an echo?", sim.getPassthrough());
}
/** ... and for a dynamic field */
public void testDefaultsDynamic() throws Exception {
MockConfigurableSimilarity sim = getSimilarity("text_sim3", MockConfigurableSimilarity.class);
assertEquals("is there an echo?", sim.getPassthrough());
}
/** test a field that does not exist */
public void testNonexistent() throws Exception {
MockConfigurableSimilarity sim = getSimilarity("text_sim3", MockConfigurableSimilarity.class);
assertEquals("is there an echo?", sim.getPassthrough());
}
}