Add a scripted similarity. (#25831)

The goal of this similarity is to help users who would like to keep the
functionality of the `tf-idf` similarity that we want to remove, or to allow
for specific use-cases (disabling idf, disabling tf, disabling length norm,
etc.) without having to build a custom plugin and familiarize themselves with
the low-level Lucene API.
This commit is contained in:
Adrien Grand 2017-08-08 08:55:12 +02:00 committed by GitHub
parent 872526cad3
commit f0cba4fce5
27 changed files with 1272 additions and 49 deletions

View File

@ -140,7 +140,7 @@ public class MetaDataIndexUpgradeService extends AbstractComponent {
// We cannot instantiate real analysis server at this point because the node might not have
// been started yet. However, we don't really need real analyzers at this stage - so we can fake it
IndexSettings indexSettings = new IndexSettings(indexMetaData, this.settings);
SimilarityService similarityService = new SimilarityService(indexSettings, Collections.emptyMap());
SimilarityService similarityService = new SimilarityService(indexSettings, null, Collections.emptyMap());
final NamedAnalyzer fakeDefault = new NamedAnalyzer("default", AnalyzerScope.INDEX, new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {

View File

@ -69,7 +69,7 @@ import java.util.function.Function;
* IndexModule represents the central extension point for index level custom implementations like:
* <ul>
* <li>{@link SimilarityProvider} - New {@link SimilarityProvider} implementations can be registered through
* {@link #addSimilarity(String, TriFunction)}while existing Providers can be referenced through Settings under the
* {@link #addSimilarity(String, SimilarityProvider.Factory)} while existing Providers can be referenced through Settings under the
* {@link IndexModule#SIMILARITY_SETTINGS_PREFIX} prefix along with the "type" value. For example, to reference the
* {@link BM25SimilarityProvider}, the configuration <tt>"index.similarity.my_similarity.type : "BM25"</tt> can be used.</li>
* <li>{@link IndexStore} - Custom {@link IndexStore} instances can be registered via {@link #addIndexStore(String, Function)}</li>
@ -112,7 +112,7 @@ public final class IndexModule {
final SetOnce<EngineFactory> engineFactory = new SetOnce<>();
private SetOnce<IndexSearcherWrapperFactory> indexSearcherWrapper = new SetOnce<>();
private final Set<IndexEventListener> indexEventListeners = new HashSet<>();
private final Map<String, TriFunction<String, Settings, Settings, SimilarityProvider>> similarities = new HashMap<>();
private final Map<String, SimilarityProvider.Factory> similarities = new HashMap<>();
private final Map<String, Function<IndexSettings, IndexStore>> storeTypes = new HashMap<>();
private final SetOnce<BiFunction<IndexSettings, IndicesQueryCache, QueryCache>> forceQueryCacheProvider = new SetOnce<>();
private final List<SearchOperationListener> searchOperationListeners = new ArrayList<>();
@ -256,7 +256,7 @@ public final class IndexModule {
* @param name Name of the SimilarityProvider
* @param similarity SimilarityProvider to register
*/
public void addSimilarity(String name, TriFunction<String, Settings, Settings, SimilarityProvider> similarity) {
public void addSimilarity(String name, SimilarityProvider.Factory similarity) {
ensureNotFrozen();
if (similarities.containsKey(name) || SimilarityService.BUILT_IN.containsKey(name)) {
throw new IllegalArgumentException("similarity for name: [" + name + " is already registered");
@ -361,7 +361,8 @@ public final class IndexModule {
} else {
queryCache = new DisabledQueryCache(indexSettings);
}
return new IndexService(indexSettings, environment, xContentRegistry, new SimilarityService(indexSettings, similarities),
return new IndexService(indexSettings, environment, xContentRegistry,
new SimilarityService(indexSettings, scriptService, similarities),
shardStoreDeleter, analysisRegistry, engineFactory.get(), circuitBreakerService, bigArrays, threadPool, scriptService,
client, queryCache, store, eventListener, searcherWrapperFactory, mapperRegistry,
indicesFieldDataCache, searchOperationListeners, indexOperationListeners, namedWriteableRegistry);
@ -371,9 +372,10 @@ public final class IndexModule {
* creates a new mapper service to do administrative work like mapping updates. This *should not* be used for document parsing.
* doing so will result in an exception.
*/
public MapperService newIndexMapperService(NamedXContentRegistry xContentRegistry, MapperRegistry mapperRegistry) throws IOException {
public MapperService newIndexMapperService(NamedXContentRegistry xContentRegistry, MapperRegistry mapperRegistry,
ScriptService scriptService) throws IOException {
return new MapperService(indexSettings, analysisRegistry.build(indexSettings), xContentRegistry,
new SimilarityService(indexSettings, similarities), mapperRegistry,
new SimilarityService(indexSettings, scriptService, similarities), mapperRegistry,
() -> { throw new UnsupportedOperationException("no index query shard context available"); });
}

View File

@ -0,0 +1,284 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.similarity;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
import org.elasticsearch.script.SimilarityScript;
import org.elasticsearch.script.SimilarityWeightScript;
import java.io.IOException;
/**
 * A {@link Similarity} implementation whose scores are computed by scripts.
 * <p>
 * Scoring is split in two parts: an optional weight script that only sees query-, field- and
 * term-level information, and a script that additionally sees per-document information together
 * with the weight that was computed beforehand.
 */
public final class ScriptedSimilarity extends Similarity {

    final String weightScriptSource;
    final String scriptSource;
    final SimilarityWeightScript.Factory weightScriptFactory;
    final SimilarityScript.Factory scriptFactory;
    final boolean discountOverlaps;

    /** Sole constructor. */
    public ScriptedSimilarity(String weightScriptString, SimilarityWeightScript.Factory weightScriptFactory,
            String scriptString, SimilarityScript.Factory scriptFactory, boolean discountOverlaps) {
        this.weightScriptSource = weightScriptString;
        this.scriptSource = scriptString;
        this.weightScriptFactory = weightScriptFactory;
        this.scriptFactory = scriptFactory;
        this.discountOverlaps = discountOverlaps;
    }

    @Override
    public String toString() {
        final StringBuilder description = new StringBuilder(getClass().getSimpleName());
        description.append("(weightScript=[").append(weightScriptSource);
        description.append("], script=[").append(scriptSource).append("])");
        return description.toString();
    }

    /**
     * Encode the length of the field with {@link SmallFloat#intToByte4(int)}. Overlapping tokens are
     * subtracted from the length when {@code discountOverlaps} is set.
     */
    @Override
    public long computeNorm(FieldInvertState state) {
        int length = state.getLength();
        if (discountOverlaps) {
            length -= state.getNumOverlap();
        }
        return SmallFloat.intToByte4(length);
    }

    /** Capture the query boost and the field/term statistics that the scripts will be given. */
    @Override
    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
        final Query query = new Query(boost);

        // use maxDoc in place of the doc count when the latter is reported as -1
        final long reportedDocCount = collectionStats.docCount();
        final long docCount = reportedDocCount == -1 ? collectionStats.maxDoc() : reportedDocCount;
        final Field field = new Field(docCount, collectionStats.sumDocFreq(), collectionStats.sumTotalTermFreq());

        final Term[] terms = new Term[termStats.length];
        int slot = 0;
        for (TermStatistics stats : termStats) {
            terms[slot++] = new Term(stats.docFreq(), stats.totalTermFreq());
        }
        return new Weight(collectionStats.field(), query, field, terms);
    }

    /**
     * Compute the part of the score that does not depend on the current document by running the
     * weight script ({@code weight_script}). Returns {@code 1} when no weight script was provided.
     */
    private double computeWeight(Query query, Field field, Term term) throws IOException {
        if (weightScriptFactory != null) {
            return weightScriptFactory.newInstance().execute(query, field, term);
        }
        return 1d;
    }

    /**
     * Build one scorer per term of the weight. A single term is scored directly, otherwise the
     * per-term scores are summed.
     */
    @Override
    public SimScorer simScorer(SimWeight w, LeafReaderContext context) throws IOException {
        final Weight weight = (Weight) w;
        final int termCount = weight.terms.length;
        final SimScorer[] termScorers = new SimScorer[termCount];
        for (int i = 0; i < termCount; ++i) {
            termScorers[i] = newTermScorer(weight, weight.terms[i], context);
        }
        return termCount == 1 ? termScorers[0] : sumOf(termScorers);
    }

    /** Create the scorer of a single term; it owns its script instance, norms and {@link Doc}. */
    private SimScorer newTermScorer(final Weight weight, final Term term, LeafReaderContext context) throws IOException {
        final SimilarityScript script = scriptFactory.newInstance();
        final NumericDocValues norms = context.reader().getNormValues(weight.fieldName);
        final Doc doc = new Doc(norms);
        final double scoreWeight = computeWeight(weight.query, weight.field, term);

        return new SimScorer() {

            @Override
            public float score(int docID, float freq) throws IOException {
                // expose the current document to the script through the shared Doc instance
                doc.docID = docID;
                doc.freq = freq;
                final double result = script.execute(scoreWeight, weight.query, weight.field, term, doc);
                return (float) result;
            }

            @Override
            public float computeSlopFactor(int distance) {
                return 1.0f / (distance + 1);
            }

            @Override
            public float computePayloadFactor(int docID, int start, int end, BytesRef payload) {
                return 1f;
            }

            @Override
            public Explanation explain(int docID, Explanation freq) throws IOException {
                doc.docID = docID;
                final float value = score(docID, freq.getValue());
                final String description = "score from " + ScriptedSimilarity.this.toString() + " computed from:";
                // one detail per input that was made available to the script
                final Explanation[] details = {
                    Explanation.match((float) scoreWeight, "weight"),
                    Explanation.match(weight.query.getBoost(), "query.boost"),
                    Explanation.match(weight.field.getDocCount(), "field.docCount"),
                    Explanation.match(weight.field.getSumDocFreq(), "field.sumDocFreq"),
                    Explanation.match(weight.field.getSumTotalTermFreq(), "field.sumTotalTermFreq"),
                    Explanation.match(term.getDocFreq(), "term.docFreq"),
                    Explanation.match(term.getTotalTermFreq(), "term.totalTermFreq"),
                    Explanation.match(freq.getValue(), "doc.freq", freq.getDetails()),
                    Explanation.match(doc.getLength(), "doc.length")
                };
                return Explanation.match(value, description, details);
            }
        };
    }

    /** Combine per-term scorers by summing their scores, like a BooleanQuery would do. */
    private static SimScorer sumOf(final SimScorer[] termScorers) {
        return new SimScorer() {

            @Override
            public float score(int docID, float freq) throws IOException {
                double total = 0;
                for (int i = 0; i < termScorers.length; ++i) {
                    total += termScorers[i].score(docID, freq);
                }
                return (float) total;
            }

            @Override
            public float computeSlopFactor(int distance) {
                return 1.0f / (distance + 1);
            }

            @Override
            public float computePayloadFactor(int docID, int start, int end, BytesRef payload) {
                return 1f;
            }

            @Override
            public Explanation explain(int docID, Explanation freq) throws IOException {
                final Explanation[] perTerm = new Explanation[termScorers.length];
                int slot = 0;
                for (SimScorer termScorer : termScorers) {
                    perTerm[slot++] = termScorer.explain(docID, freq);
                }
                return Explanation.match(score(docID, freq.getValue()), "Sum of:", perTerm);
            }
        };
    }

    /** Holder of everything that {@link #computeWeight(float, CollectionStatistics, TermStatistics...)} gathered. */
    private static class Weight extends SimWeight {
        private final String fieldName;
        private final Query query;
        private final Field field;
        private final Term[] terms;

        Weight(String fieldName, Query query, Field field, Term[] terms) {
            this.fieldName = fieldName;
            this.query = query;
            this.field = field;
            this.terms = terms;
        }
    }

    /** Scoring factors that come from the query. */
    public static class Query {
        private final float boost;

        private Query(float boost) {
            this.boost = boost;
        }

        /** The boost of the query. It should typically be incorporated into the score as a multiplicative factor. */
        public float getBoost() {
            return boost;
        }
    }

    /** Statistics that are specific to a given field. */
    public static class Field {
        private final long docCount;
        private final long sumDocFreq;
        private final long sumTotalTermFreq;

        private Field(long docCount, long sumDocFreq, long sumTotalTermFreq) {
            this.docCount = docCount;
            this.sumDocFreq = sumDocFreq;
            this.sumTotalTermFreq = sumTotalTermFreq;
        }

        /** Return the number of documents that have a value for this field. */
        public long getDocCount() {
            return docCount;
        }

        /** Return the sum of {@link Term#getDocFreq()} for all terms that exist in this field,
         *  or {@code -1} if this statistic is not available. */
        public long getSumDocFreq() {
            return sumDocFreq;
        }

        /** Return the sum of {@link Term#getTotalTermFreq()} for all terms that exist in this field,
         *  or {@code -1} if this statistic is not available. */
        public long getSumTotalTermFreq() {
            return sumTotalTermFreq;
        }
    }

    /** Statistics that are specific to a given term. */
    public static class Term {
        private final long docFreq;
        private final long totalTermFreq;

        private Term(long docFreq, long totalTermFreq) {
            this.docFreq = docFreq;
            this.totalTermFreq = totalTermFreq;
        }

        /** Return the number of documents that contain this term in the index. */
        public long getDocFreq() {
            return docFreq;
        }

        /** Return the total number of occurrences of this term in the index, or {@code -1} if this statistic is not available. */
        public long getTotalTermFreq() {
            return totalTermFreq;
        }
    }

    /** Statistics that are specific to a document. */
    public static class Doc {
        private final NumericDocValues norms;
        private int docID;
        private float freq;

        private Doc(NumericDocValues norms) {
            this.norms = norms;
        }

        /** Return the number of tokens that the current document has in the considered field. */
        public int getLength() throws IOException {
            // norms are only read on demand so that scripts that ignore the length do not pay for it
            if (norms == null) {
                return 1;
            }
            if (norms.advanceExact(docID) == false) {
                return 0;
            }
            // reverse of the encoding applied in computeNorm
            return SmallFloat.byte4ToInt((byte) norms.longValue());
        }

        /** Return the number of occurrences of the term in the current document for the considered field. */
        public float getFreq() {
            return freq;
        }
    }
}

View File

@ -0,0 +1,58 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.script.SimilarityScript;
import org.elasticsearch.script.SimilarityWeightScript;
/** Provider of scripted similarities. */
public class ScriptedSimilarityProvider extends AbstractSimilarityProvider {
private final ScriptedSimilarity scriptedSimilarity;
public ScriptedSimilarityProvider(String name, Settings settings, Settings indexSettings, ScriptService scriptService) {
super(name);
boolean discountOverlaps = settings.getAsBoolean("discount_overlaps", true);
Settings scriptSettings = settings.getAsSettings("script");
Script script = Script.parse(scriptSettings);
SimilarityScript.Factory scriptFactory = scriptService.compile(script, SimilarityScript.CONTEXT);
Settings weightScriptSettings = settings.getAsSettings("weight_script");
Script weightScript = null;
SimilarityWeightScript.Factory weightScriptFactory = null;
if (weightScriptSettings.isEmpty() == false) {
weightScript = Script.parse(weightScriptSettings);
weightScriptFactory = scriptService.compile(weightScript, SimilarityWeightScript.CONTEXT);
}
scriptedSimilarity = new ScriptedSimilarity(
weightScript == null ? null : weightScript.toString(),
weightScriptFactory == null ? null : weightScriptFactory::newInstance,
script.toString(), scriptFactory::newInstance, discountOverlaps);
}
@Override
public Similarity get() {
return scriptedSimilarity;
}
}

View File

@ -20,6 +20,8 @@
package org.elasticsearch.index.similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.script.ScriptService;
/**
* Provider for {@link Similarity} instances
@ -39,4 +41,11 @@ public interface SimilarityProvider {
* @return Provided {@link Similarity}
*/
Similarity get();
/**
 * Factory of {@link SimilarityProvider}.
 * <p>
 * This is a functional interface, so implementations may be given as lambdas or constructor references.
 */
@FunctionalInterface
interface Factory {
/**
 * Create a new {@link SimilarityProvider}.
 *
 * @param name          name of the similarity to create
 * @param settings      settings of the similarity itself
 * @param indexSettings settings of the index that the similarity belongs to
 * @param scriptService service that implementations may use to compile scripts; NOTE(review): some
 *                      callers in this change build their SimilarityService with a {@code null}
 *                      script service, so implementations should not assume it is non-null
 * @return the new provider
 */
SimilarityProvider create(String name, Settings settings, Settings indexSettings, ScriptService scriptService);
}
}

View File

@ -22,7 +22,6 @@ package org.elasticsearch.index.similarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.elasticsearch.Version;
import org.elasticsearch.common.TriFunction;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Settings;
@ -31,6 +30,7 @@ import org.elasticsearch.index.IndexModule;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.script.ScriptService;
import java.util.Collections;
import java.util.HashMap;
@ -42,27 +42,38 @@ public final class SimilarityService extends AbstractIndexComponent {
public static final String DEFAULT_SIMILARITY = "BM25";
private final Similarity defaultSimilarity;
private final Map<String, SimilarityProvider> similarities;
private static final Map<String, TriFunction<String, Settings, Settings, SimilarityProvider>> DEFAULTS;
public static final Map<String, TriFunction<String, Settings, Settings, SimilarityProvider>> BUILT_IN;
private static final Map<String, SimilarityProvider.Factory> DEFAULTS;
public static final Map<String, SimilarityProvider.Factory> BUILT_IN;
static {
Map<String, TriFunction<String, Settings, Settings, SimilarityProvider>> defaults = new HashMap<>();
Map<String, TriFunction<String, Settings, Settings, SimilarityProvider>> buildIn = new HashMap<>();
defaults.put("classic", ClassicSimilarityProvider::new);
defaults.put("BM25", BM25SimilarityProvider::new);
defaults.put("boolean", BooleanSimilarityProvider::new);
buildIn.put("classic", ClassicSimilarityProvider::new);
buildIn.put("BM25", BM25SimilarityProvider::new);
buildIn.put("DFR", DFRSimilarityProvider::new);
buildIn.put("IB", IBSimilarityProvider::new);
buildIn.put("LMDirichlet", LMDirichletSimilarityProvider::new);
buildIn.put("LMJelinekMercer", LMJelinekMercerSimilarityProvider::new);
buildIn.put("DFI", DFISimilarityProvider::new);
Map<String, SimilarityProvider.Factory> defaults = new HashMap<>();
Map<String, SimilarityProvider.Factory> buildIn = new HashMap<>();
defaults.put("classic",
(name, settings, indexSettings, scriptService) -> new ClassicSimilarityProvider(name, settings, indexSettings));
defaults.put("BM25",
(name, settings, indexSettings, scriptService) -> new BM25SimilarityProvider(name, settings, indexSettings));
defaults.put("boolean",
(name, settings, indexSettings, scriptService) -> new BooleanSimilarityProvider(name, settings, indexSettings));
buildIn.put("classic",
(name, settings, indexSettings, scriptService) -> new ClassicSimilarityProvider(name, settings, indexSettings));
buildIn.put("BM25",
(name, settings, indexSettings, scriptService) -> new BM25SimilarityProvider(name, settings, indexSettings));
buildIn.put("DFR",
(name, settings, indexSettings, scriptService) -> new DFRSimilarityProvider(name, settings, indexSettings));
buildIn.put("IB",
(name, settings, indexSettings, scriptService) -> new IBSimilarityProvider(name, settings, indexSettings));
buildIn.put("LMDirichlet",
(name, settings, indexSettings, scriptService) -> new LMDirichletSimilarityProvider(name, settings, indexSettings));
buildIn.put("LMJelinekMercer",
(name, settings, indexSettings, scriptService) -> new LMJelinekMercerSimilarityProvider(name, settings, indexSettings));
buildIn.put("DFI",
(name, settings, indexSettings, scriptService) -> new DFISimilarityProvider(name, settings, indexSettings));
buildIn.put("scripted", ScriptedSimilarityProvider::new);
DEFAULTS = Collections.unmodifiableMap(defaults);
BUILT_IN = Collections.unmodifiableMap(buildIn);
}
public SimilarityService(IndexSettings indexSettings,
Map<String, TriFunction<String, Settings, Settings, SimilarityProvider>> similarities) {
public SimilarityService(IndexSettings indexSettings, ScriptService scriptService,
Map<String, SimilarityProvider.Factory> similarities) {
super(indexSettings);
Map<String, SimilarityProvider> providers = new HashMap<>(similarities.size());
Map<String, Settings> similaritySettings = this.indexSettings.getSettings().getGroups(IndexModule.SIMILARITY_SETTINGS_PREFIX);
@ -79,14 +90,12 @@ public final class SimilarityService extends AbstractIndexComponent {
} else if ((similarities.containsKey(typeName) || BUILT_IN.containsKey(typeName)) == false) {
throw new IllegalArgumentException("Unknown Similarity type [" + typeName + "] for [" + name + "]");
}
TriFunction<String, Settings, Settings, SimilarityProvider> defaultFactory = BUILT_IN.get(typeName);
TriFunction<String, Settings, Settings, SimilarityProvider> factory = similarities.getOrDefault(typeName, defaultFactory);
if (providerSettings == null) {
providerSettings = Settings.Builder.EMPTY_SETTINGS;
SimilarityProvider.Factory defaultFactory = BUILT_IN.get(typeName);
SimilarityProvider.Factory factory = similarities.getOrDefault(typeName, defaultFactory);
providers.put(name, factory.create(name, providerSettings, indexSettings.getSettings(), scriptService));
}
providers.put(name, factory.apply(name, providerSettings, indexSettings.getSettings()));
}
Map<String, SimilarityProvider> providerMapping = addSimilarities(similaritySettings, indexSettings.getSettings(), DEFAULTS);
Map<String, SimilarityProvider> providerMapping = addSimilarities(similaritySettings, indexSettings.getSettings(), scriptService,
DEFAULTS);
for (Map.Entry<String, SimilarityProvider> entry : providerMapping.entrySet()) {
// Avoid overwriting custom providers for indices older that v5.0
if (providers.containsKey(entry.getKey()) && indexSettings.getIndexVersionCreated().before(Version.V_5_0_0_alpha1)) {
@ -109,16 +118,16 @@ public final class SimilarityService extends AbstractIndexComponent {
}
private Map<String, SimilarityProvider> addSimilarities(Map<String, Settings> similaritySettings, Settings indexSettings,
Map<String, TriFunction<String, Settings, Settings, SimilarityProvider>> similarities) {
ScriptService scriptService, Map<String, SimilarityProvider.Factory> similarities) {
Map<String, SimilarityProvider> providers = new HashMap<>(similarities.size());
for (Map.Entry<String, TriFunction<String, Settings, Settings, SimilarityProvider>> entry : similarities.entrySet()) {
for (Map.Entry<String, SimilarityProvider.Factory> entry : similarities.entrySet()) {
String name = entry.getKey();
TriFunction<String, Settings, Settings, SimilarityProvider> factory = entry.getValue();
SimilarityProvider.Factory factory = entry.getValue();
Settings providerSettings = similaritySettings.get(name);
if (providerSettings == null) {
providerSettings = Settings.Builder.EMPTY_SETTINGS;
}
providers.put(name, factory.apply(name, providerSettings, indexSettings));
providers.put(name, factory.create(name, providerSettings, indexSettings, scriptService));
}
return providers;
}

View File

@ -467,7 +467,7 @@ public class IndicesService extends AbstractLifecycleComponent
final IndexSettings idxSettings = new IndexSettings(indexMetaData, this.settings, indexScopeSetting);
final IndexModule indexModule = new IndexModule(idxSettings, analysisRegistry);
pluginsService.onIndexModule(indexModule);
return indexModule.newIndexMapperService(xContentRegistry, mapperRegistry);
return indexModule.newIndexMapperService(xContentRegistry, mapperRegistry, scriptService);
}
/**

View File

@ -25,6 +25,8 @@ import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser.ValueType;
import org.elasticsearch.common.xcontent.ToXContentObject;
@ -33,6 +35,7 @@ import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.common.xcontent.json.JsonXContent;
import java.io.IOException;
import java.io.UncheckedIOException;
@ -269,6 +272,20 @@ public final class Script implements ToXContentObject, Writeable {
return parse(parser, DEFAULT_SCRIPT_LANG);
}
/**
 * Parse the script configured in the given settings.
 * <p>
 * The structured map of the settings is first serialized to JSON in memory, and the resulting
 * JSON is then parsed as a script.
 *
 * @param settings the settings that describe the script
 * @return the parsed script
 * @throws IllegalStateException if an {@link IOException} is raised while building or parsing the JSON
 */
public static Script parse(Settings settings) {
    try {
        XContentBuilder builder = JsonXContent.contentBuilder();
        builder.map(settings.getAsStructuredMap());
        // release the parser as soon as the script has been read instead of leaving it open
        try (XContentParser parser = JsonXContent.jsonXContent.createParser(NamedXContentRegistry.EMPTY, builder.bytes())) {
            return parse(parser);
        }
    } catch (IOException e) {
        // it should not happen since we are not actually reading from a stream but an in-memory byte[]
        throw new IllegalStateException(e);
    }
}
/**
* This will parse XContent into a {@link Script}. The following formats can be parsed:
*

View File

@ -45,6 +45,8 @@ public class ScriptModule {
ExecutableScript.AGGS_CONTEXT,
ExecutableScript.UPDATE_CONTEXT,
ExecutableScript.INGEST_CONTEXT,
SimilarityScript.CONTEXT,
SimilarityWeightScript.CONTEXT,
TemplateScript.CONTEXT
).collect(Collectors.toMap(c -> c.name, Function.identity()));
}

View File

@ -0,0 +1,45 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.script;
import org.elasticsearch.index.similarity.ScriptedSimilarity;
import java.io.IOException;
/** Script that {@link ScriptedSimilarity} runs to compute scores. */
public abstract class SimilarityScript {

    /** Names of the arguments of {@link #execute}, in the order in which they are declared. */
    public static final String[] PARAMETERS = {"weight", "query", "field", "term", "doc"};

    /** Context of similarity scripts; its name is {@code similarity}. */
    public static final ScriptContext<Factory> CONTEXT = new ScriptContext<>("similarity", Factory.class);

    /** Creates new {@link SimilarityScript} instances. */
    public interface Factory {
        SimilarityScript newInstance();
    }

    /**
     * Compute the score.
     *
     * @param weight weight computed by the {@link SimilarityWeightScript} if any, or 1.
     * @param query  scoring factors that come from the query
     * @param field  field-level statistics
     * @param term   term-level statistics
     * @param doc    per-document statistics
     */
    public abstract double execute(
            double weight,
            ScriptedSimilarity.Query query,
            ScriptedSimilarity.Field field,
            ScriptedSimilarity.Term term,
            ScriptedSimilarity.Doc doc) throws IOException;
}

View File

@ -0,0 +1,43 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.script;
import org.elasticsearch.index.similarity.ScriptedSimilarity;
import java.io.IOException;
/** Script that computes the scoring factors which are the same for all documents. */
public abstract class SimilarityWeightScript {

    /** Names of the arguments of {@link #execute}, in the order in which they are declared. */
    public static final String[] PARAMETERS = {"query", "field", "term"};

    /** Context of similarity weight scripts; its name is {@code similarity_weight}. */
    public static final ScriptContext<Factory> CONTEXT = new ScriptContext<>("similarity_weight", Factory.class);

    /** Creates new {@link SimilarityWeightScript} instances. */
    public interface Factory {
        SimilarityWeightScript newInstance();
    }

    /**
     * Compute the weight.
     *
     * @param query scoring factors that come from the query
     * @param field field-level statistics
     * @param term  term-level statistics
     */
    public abstract double execute(
            ScriptedSimilarity.Query query,
            ScriptedSimilarity.Field field,
            ScriptedSimilarity.Term term) throws IOException;
}

View File

@ -284,7 +284,7 @@ public class IndexModuleTests extends ESTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry);
module.addSimilarity("test_similarity", (string, providerSettings, indexLevelSettings) -> new SimilarityProvider() {
module.addSimilarity("test_similarity", (string, providerSettings, indexLevelSettings, scriptService) -> new SimilarityProvider() {
@Override
public String name() {
return string;

View File

@ -90,7 +90,7 @@ public class CodecTests extends ESTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.build();
IndexSettings settings = IndexSettingsModule.newIndexSettings("_na", nodeSettings);
SimilarityService similarityService = new SimilarityService(settings, Collections.emptyMap());
SimilarityService similarityService = new SimilarityService(settings, null, Collections.emptyMap());
IndexAnalyzers indexAnalyzers = createTestAnalysis(settings, nodeSettings).indexAnalyzers;
MapperRegistry mapperRegistry = new MapperRegistry(Collections.emptyMap(), Collections.emptyMap());
MapperService service = new MapperService(settings, indexAnalyzers, xContentRegistry(), similarityService, mapperRegistry,

View File

@ -2774,7 +2774,7 @@ public class InternalEngineTests extends ESTestCase {
public TranslogHandler(NamedXContentRegistry xContentRegistry, IndexSettings indexSettings) {
NamedAnalyzer defaultAnalyzer = new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer());
IndexAnalyzers indexAnalyzers = new IndexAnalyzers(indexSettings, defaultAnalyzer, defaultAnalyzer, defaultAnalyzer, Collections.emptyMap(), Collections.emptyMap());
SimilarityService similarityService = new SimilarityService(indexSettings, Collections.emptyMap());
SimilarityService similarityService = new SimilarityService(indexSettings, null, Collections.emptyMap());
MapperRegistry mapperRegistry = new IndicesModule(Collections.emptyList()).getMapperRegistry();
mapperService = new MapperService(indexSettings, indexAnalyzers, xContentRegistry, similarityService, mapperRegistry,
() -> null);

View File

@ -113,7 +113,7 @@ public class ParentFieldMapperTests extends ESSingleNodeTestCase {
NamedAnalyzer namedAnalyzer = new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer());
IndexAnalyzers indexAnalyzers = new IndexAnalyzers(indexSettings, namedAnalyzer, namedAnalyzer, namedAnalyzer,
Collections.emptyMap(), Collections.emptyMap());
SimilarityService similarityService = new SimilarityService(indexSettings, Collections.emptyMap());
SimilarityService similarityService = new SimilarityService(indexSettings, null, Collections.emptyMap());
MapperService mapperService = new MapperService(indexSettings, indexAnalyzers, xContentRegistry(), similarityService,
new IndicesModule(emptyList()).getMapperRegistry(), () -> null);
XContentBuilder mappingSource = jsonBuilder().startObject().startObject("some_type")

View File

@ -0,0 +1,224 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.similarity;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
import org.elasticsearch.script.SimilarityScript;
import org.elasticsearch.script.SimilarityWeightScript;
import org.elasticsearch.test.ESTestCase;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean;
public class ScriptedSimilarityTests extends ESTestCase {
public void testSameNormsAsBM25CountOverlaps() {
doTestSameNormsAsBM25(false);
}
public void testSameNormsAsBM25DiscountOverlaps() {
doTestSameNormsAsBM25(true);
}
private void doTestSameNormsAsBM25(boolean discountOverlaps) {
ScriptedSimilarity sim1 = new ScriptedSimilarity("foobar", null, "foobaz", null, discountOverlaps);
BM25Similarity sim2 = new BM25Similarity();
sim2.setDiscountOverlaps(discountOverlaps);
for (int iter = 0; iter < 100; ++iter) {
final int length = TestUtil.nextInt(random(), 1, 100);
final int position = random().nextInt(length);
final int numOverlaps = random().nextInt(length);
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
assertEquals(
sim2.computeNorm(state),
sim1.computeNorm(state),
0f);
}
}
public void testBasics() throws IOException {
final AtomicBoolean called = new AtomicBoolean();
SimilarityScript.Factory scriptFactory = () -> {
return new SimilarityScript() {
@Override
public double execute(double weight, ScriptedSimilarity.Query query,
ScriptedSimilarity.Field field, ScriptedSimilarity.Term term,
ScriptedSimilarity.Doc doc) throws IOException {
assertEquals(1, weight, 0);
assertNotNull(doc);
assertEquals(2f, doc.getFreq(), 0);
assertEquals(3, doc.getLength(), 0);
assertNotNull(field);
assertEquals(3, field.getDocCount());
assertEquals(5, field.getSumDocFreq());
assertEquals(6, field.getSumTotalTermFreq());
assertNotNull(term);
assertEquals(2, term.getDocFreq());
assertEquals(3, term.getTotalTermFreq());
assertNotNull(query);
assertEquals(3.2f, query.getBoost(), 0);
called.set(true);
return 42f;
}
};
};
ScriptedSimilarity sim = new ScriptedSimilarity("foobar", null, "foobaz", scriptFactory, true);
Directory dir = new RAMDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(sim));
Document doc = new Document();
doc.add(new TextField("f", "foo bar", Store.NO));
doc.add(new StringField("match", "no", Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(new TextField("f", "foo foo bar", Store.NO));
doc.add(new StringField("match", "yes", Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(new TextField("f", "bar", Store.NO));
doc.add(new StringField("match", "no", Store.NO));
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
w.close();
IndexSearcher searcher = new IndexSearcher(r);
searcher.setSimilarity(sim);
Query query = new BoostQuery(new BooleanQuery.Builder()
.add(new TermQuery(new Term("f", "foo")), Occur.SHOULD)
.add(new TermQuery(new Term("match", "yes")), Occur.FILTER)
.build(), 3.2f);
TopDocs topDocs = searcher.search(query, 1);
assertEquals(1, topDocs.totalHits);
assertTrue(called.get());
assertEquals(42, topDocs.scoreDocs[0].score, 0);
w.close();
dir.close();
}
public void testInitScript() throws IOException {
final AtomicBoolean initCalled = new AtomicBoolean();
SimilarityWeightScript.Factory weightScriptFactory = () -> {
return new SimilarityWeightScript() {
@Override
public double execute(ScriptedSimilarity.Query query, ScriptedSimilarity.Field field,
ScriptedSimilarity.Term term) throws IOException {
assertNotNull(field);
assertEquals(3, field.getDocCount());
assertEquals(5, field.getSumDocFreq());
assertEquals(6, field.getSumTotalTermFreq());
assertNotNull(term);
assertEquals(2, term.getDocFreq());
assertEquals(3, term.getTotalTermFreq());
assertNotNull(query);
assertEquals(3.2f, query.getBoost(), 0);
initCalled.set(true);
return 28;
}
};
};
final AtomicBoolean called = new AtomicBoolean();
SimilarityScript.Factory scriptFactory = () -> {
return new SimilarityScript() {
@Override
public double execute(double weight, ScriptedSimilarity.Query query,
ScriptedSimilarity.Field field, ScriptedSimilarity.Term term,
ScriptedSimilarity.Doc doc) throws IOException {
assertEquals(28, weight, 0d);
assertNotNull(doc);
assertEquals(2f, doc.getFreq(), 0);
assertEquals(3, doc.getLength(), 0);
assertNotNull(field);
assertEquals(3, field.getDocCount());
assertEquals(5, field.getSumDocFreq());
assertEquals(6, field.getSumTotalTermFreq());
assertNotNull(term);
assertEquals(2, term.getDocFreq());
assertEquals(3, term.getTotalTermFreq());
assertNotNull(query);
assertEquals(3.2f, query.getBoost(), 0);
called.set(true);
return 42;
}
};
};
ScriptedSimilarity sim = new ScriptedSimilarity("foobar", weightScriptFactory, "foobaz", scriptFactory, true);
Directory dir = new RAMDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(sim));
Document doc = new Document();
doc.add(new TextField("f", "foo bar", Store.NO));
doc.add(new StringField("match", "no", Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(new TextField("f", "foo foo bar", Store.NO));
doc.add(new StringField("match", "yes", Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(new TextField("f", "bar", Store.NO));
doc.add(new StringField("match", "no", Store.NO));
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
w.close();
IndexSearcher searcher = new IndexSearcher(r);
searcher.setSimilarity(sim);
Query query = new BoostQuery(new BooleanQuery.Builder()
.add(new TermQuery(new Term("f", "foo")), Occur.SHOULD)
.add(new TermQuery(new Term("match", "yes")), Occur.FILTER)
.build(), 3.2f);
TopDocs topDocs = searcher.search(query, 1);
assertEquals(1, topDocs.totalHits);
assertTrue(initCalled.get());
assertTrue(called.get());
assertEquals(42, topDocs.scoreDocs[0].score, 0);
w.close();
dir.close();
}
}

View File

@ -33,7 +33,7 @@ public class SimilarityServiceTests extends ESTestCase {
public void testDefaultSimilarity() {
Settings settings = Settings.builder().build();
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
SimilarityService service = new SimilarityService(indexSettings, Collections.emptyMap());
SimilarityService service = new SimilarityService(indexSettings, null, Collections.emptyMap());
assertThat(service.getDefaultSimilarity(), instanceOf(BM25Similarity.class));
}
@ -42,7 +42,7 @@ public class SimilarityServiceTests extends ESTestCase {
Settings settings = Settings.builder().put("index.similarity.BM25.type", "classic").build();
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
try {
new SimilarityService(indexSettings, Collections.emptyMap());
new SimilarityService(indexSettings, null, Collections.emptyMap());
fail("can't override bm25");
} catch (IllegalArgumentException ex) {
assertEquals(ex.getMessage(), "Cannot redefine built-in Similarity [BM25]");
@ -53,7 +53,7 @@ public class SimilarityServiceTests extends ESTestCase {
Settings settings = Settings.builder().put("index.similarity.default.type", "classic")
.build();
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
SimilarityService service = new SimilarityService(indexSettings, Collections.emptyMap());
SimilarityService service = new SimilarityService(indexSettings, null, Collections.emptyMap());
assertTrue(service.getDefaultSimilarity() instanceof ClassicSimilarity);
}
}

View File

@ -105,7 +105,8 @@ public class IndicesServiceTests extends ESSingleNodeTestCase {
@Override
public void onIndexModule(IndexModule indexModule) {
super.onIndexModule(indexModule);
indexModule.addSimilarity("fake-similarity", BM25SimilarityProvider::new);
indexModule.addSimilarity("fake-similarity",
(name, settings, indexSettings, scriptService) -> new BM25SimilarityProvider(name, settings, indexSettings));
}
}

View File

@ -163,6 +163,325 @@ for title queries and `0.7` for long queries. Default to `0.1`. When value appro
Type name: `LMJelinekMercer`
[float]
[[scripted_similarity]]
==== Scripted similarity
A similarity that allows you to use a script in order to specify how scores
should be computed. For instance, the below example shows how to reimplement
TF-IDF:
[source,js]
--------------------------------------------------
PUT index
{
"settings": {
"number_of_shards": 1,
"similarity": {
"scripted_tfidf": {
"type": "scripted",
"script": {
"source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;"
}
}
}
},
"mappings": {
"doc": {
"properties": {
"field": {
"type": "text",
"similarity": "scripted_tfidf"
}
}
}
}
}
PUT index/doc/1
{
"field": "foo bar foo"
}
PUT index/doc/2
{
"field": "bar baz"
}
POST index/_refresh
GET index/_search?explain=true
{
"query": {
"query_string": {
"query": "foo^1.7",
"default_field": "field"
}
}
}
--------------------------------------------------
// CONSOLE
Which yields:
[source,js]
--------------------------------------------------
{
"took": 12,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.9508477,
"hits": [
{
"_shard": "[index][0]",
"_node": "OzrdjxNtQGaqs4DmioFw9A",
"_index": "index",
"_type": "doc",
"_id": "1",
"_score": 1.9508477,
"_source": {
"field": "foo bar foo"
},
"_explanation": {
"value": 1.9508477,
"description": "weight(field:foo in 0) [PerFieldSimilarity], result of:",
"details": [
{
"value": 1.9508477,
"description": "score from ScriptedSimilarity(weightScript=[null], script=[Script{type=inline, lang='painless', idOrCode='double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;', options={}, params={}}]) computed from:",
"details": [
{
"value": 1.0,
"description": "weight",
"details": []
},
{
"value": 1.7,
"description": "query.boost",
"details": []
},
{
"value": 2.0,
"description": "field.docCount",
"details": []
},
{
"value": 4.0,
"description": "field.sumDocFreq",
"details": []
},
{
"value": 5.0,
"description": "field.sumTotalTermFreq",
"details": []
},
{
"value": 1.0,
"description": "term.docFreq",
"details": []
},
{
"value": 2.0,
"description": "term.totalTermFreq",
"details": []
},
{
"value": 2.0,
"description": "doc.freq",
"details": []
},
{
"value": 3.0,
"description": "doc.length",
"details": []
}
]
}
]
}
}
]
}
}
--------------------------------------------------
// TESTRESPONSE[s/"took": 12/"took" : $body.took/]
// TESTRESPONSE[s/OzrdjxNtQGaqs4DmioFw9A/$body.hits.hits.0._node/]
You might have noticed that a significant part of the script depends on
statistics that are the same for every document. It is possible to make the
above slightly more efficient by providing a `weight_script` which will
compute the document-independent part of the score and will be available
under the `weight` variable. When no `weight_script` is provided, `weight`
is equal to `1`. The `weight_script` has access to the same variables as
the `script` except `doc` since it is supposed to compute a
document-independent contribution to the score.
The below configuration will give the same tf-idf scores but is slightly
more efficient:
[source,js]
--------------------------------------------------
PUT index
{
"settings": {
"number_of_shards": 1,
"similarity": {
"scripted_tfidf": {
"type": "scripted",
"weight_script": {
"source": "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;"
},
"script": {
"source": "double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;"
}
}
}
},
"mappings": {
"doc": {
"properties": {
"field": {
"type": "text",
"similarity": "scripted_tfidf"
}
}
}
}
}
--------------------------------------------------
// CONSOLE
////////////////////
[source,js]
--------------------------------------------------
PUT index/doc/1
{
"field": "foo bar foo"
}
PUT index/doc/2
{
"field": "bar baz"
}
POST index/_refresh
GET index/_search?explain=true
{
"query": {
"query_string": {
"query": "foo^1.7",
"default_field": "field"
}
}
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
[source,js]
--------------------------------------------------
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.9508477,
"hits": [
{
"_shard": "[index][0]",
"_node": "OzrdjxNtQGaqs4DmioFw9A",
"_index": "index",
"_type": "doc",
"_id": "1",
"_score": 1.9508477,
"_source": {
"field": "foo bar foo"
},
"_explanation": {
"value": 1.9508477,
"description": "weight(field:foo in 0) [PerFieldSimilarity], result of:",
"details": [
{
"value": 1.9508477,
"description": "score from ScriptedSimilarity(weightScript=[Script{type=inline, lang='painless', idOrCode='double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;', options={}, params={}}], script=[Script{type=inline, lang='painless', idOrCode='double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;', options={}, params={}}]) computed from:",
"details": [
{
"value": 2.3892908,
"description": "weight",
"details": []
},
{
"value": 1.7,
"description": "query.boost",
"details": []
},
{
"value": 2.0,
"description": "field.docCount",
"details": []
},
{
"value": 4.0,
"description": "field.sumDocFreq",
"details": []
},
{
"value": 5.0,
"description": "field.sumTotalTermFreq",
"details": []
},
{
"value": 1.0,
"description": "term.docFreq",
"details": []
},
{
"value": 2.0,
"description": "term.totalTermFreq",
"details": []
},
{
"value": 2.0,
"description": "doc.freq",
"details": []
},
{
"value": 3.0,
"description": "doc.length",
"details": []
}
]
}
]
}
}
]
}
}
--------------------------------------------------
// TESTRESPONSE[s/"took": 1/"took" : $body.took/]
// TESTRESPONSE[s/OzrdjxNtQGaqs4DmioFw9A/$body.hits.hits.0._node/]
////////////////////
Type name: `scripted`
[float]
[[default-base]]
==== Default Similarity

View File

@ -165,3 +165,23 @@ class org.elasticsearch.search.lookup.FieldLookup -> org.elasticsearch.search.lo
List getValues()
boolean isEmpty()
}
class org.elasticsearch.index.similarity.ScriptedSimilarity.Query -> org.elasticsearch.index.similarity.ScriptedSimilarity$Query extends Object {
float getBoost()
}
class org.elasticsearch.index.similarity.ScriptedSimilarity.Field -> org.elasticsearch.index.similarity.ScriptedSimilarity$Field extends Object {
long getDocCount()
long getSumDocFreq()
long getSumTotalTermFreq()
}
class org.elasticsearch.index.similarity.ScriptedSimilarity.Term -> org.elasticsearch.index.similarity.ScriptedSimilarity$Term extends Object {
long getDocFreq()
long getTotalTermFreq()
}
class org.elasticsearch.index.similarity.ScriptedSimilarity.Doc -> org.elasticsearch.index.similarity.ScriptedSimilarity$Doc extends Object {
int getLength()
float getFreq()
}

View File

@ -0,0 +1,131 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.painless;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.elasticsearch.index.similarity.ScriptedSimilarity;
import org.elasticsearch.script.ScriptContext;
import org.elasticsearch.script.SimilarityScript;
import org.elasticsearch.script.SimilarityWeightScript;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
public class SimilarityScriptTests extends ScriptTestCase {
@Override
protected Collection<ScriptContext<?>> scriptContexts() {
return Arrays.asList(SimilarityScript.CONTEXT, SimilarityWeightScript.CONTEXT);
}
public void testBasics() throws IOException {
SimilarityScript.Factory factory = scriptEngine.compile(
"foobar", "return query.boost * doc.freq / doc.length", SimilarityScript.CONTEXT, Collections.emptyMap());
ScriptedSimilarity sim = new ScriptedSimilarity("foobar", null, "foobaz", factory::newInstance, true);
Directory dir = new RAMDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(sim));
Document doc = new Document();
doc.add(new TextField("f", "foo bar", Store.NO));
doc.add(new StringField("match", "no", Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(new TextField("f", "foo foo bar", Store.NO));
doc.add(new StringField("match", "yes", Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(new TextField("f", "bar", Store.NO));
doc.add(new StringField("match", "no", Store.NO));
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
w.close();
IndexSearcher searcher = new IndexSearcher(r);
searcher.setSimilarity(sim);
Query query = new BoostQuery(new BooleanQuery.Builder()
.add(new TermQuery(new Term("f", "foo")), Occur.SHOULD)
.add(new TermQuery(new Term("match", "yes")), Occur.FILTER)
.build(), 3.2f);
TopDocs topDocs = searcher.search(query, 1);
assertEquals(1, topDocs.totalHits);
assertEquals((float) (3.2 * 2 / 3), topDocs.scoreDocs[0].score, 0);
w.close();
dir.close();
}
public void testWeightScript() throws IOException {
SimilarityWeightScript.Factory weightFactory = scriptEngine.compile(
"foobar", "return query.boost", SimilarityWeightScript.CONTEXT, Collections.emptyMap());
SimilarityScript.Factory factory = scriptEngine.compile(
"foobar", "return weight * doc.freq / doc.length", SimilarityScript.CONTEXT, Collections.emptyMap());
ScriptedSimilarity sim = new ScriptedSimilarity("foobar", weightFactory::newInstance, "foobaz", factory::newInstance, true);
Directory dir = new RAMDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(sim));
Document doc = new Document();
doc.add(new TextField("f", "foo bar", Store.NO));
doc.add(new StringField("match", "no", Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(new TextField("f", "foo foo bar", Store.NO));
doc.add(new StringField("match", "yes", Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(new TextField("f", "bar", Store.NO));
doc.add(new StringField("match", "no", Store.NO));
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
w.close();
IndexSearcher searcher = new IndexSearcher(r);
searcher.setSimilarity(sim);
Query query = new BoostQuery(new BooleanQuery.Builder()
.add(new TermQuery(new Term("f", "foo")), Occur.SHOULD)
.add(new TermQuery(new Term("match", "yes")), Occur.FILTER)
.build(), 3.2f);
TopDocs topDocs = searcher.search(query, 1);
assertEquals(1, topDocs.totalHits);
assertEquals((float) (3.2 * 2 / 3), topDocs.scoreDocs[0].score, 0);
w.close();
dir.close();
}
}

View File

@ -336,7 +336,8 @@ public class HasChildQueryBuilderTests extends AbstractQueryTestCase<HasChildQue
hasChildQuery(CHILD_DOC, new TermQueryBuilder("custom_string", "value"), ScoreMode.None);
HasChildQueryBuilder.LateParsingQuery query = (HasChildQueryBuilder.LateParsingQuery) hasChildQueryBuilder.toQuery(shardContext);
Similarity expected = SimilarityService.BUILT_IN.get(similarity)
.apply(similarity, Settings.EMPTY, Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build())
.create(similarity, Settings.EMPTY,
Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(), null)
.get();
assertThat(((PerFieldSimilarityWrapper) query.getSimilarity()).get("custom_string"), instanceOf(expected.getClass()));
}

View File

@ -322,7 +322,8 @@ public class LegacyHasChildQueryBuilderTests extends AbstractQueryTestCase<HasCh
hasChildQuery(CHILD_TYPE, new TermQueryBuilder("custom_string", "value"), ScoreMode.None);
HasChildQueryBuilder.LateParsingQuery query = (HasChildQueryBuilder.LateParsingQuery) hasChildQueryBuilder.toQuery(shardContext);
Similarity expected = SimilarityService.BUILT_IN.get(similarity)
.apply(similarity, Settings.EMPTY, Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build())
.create(similarity, Settings.EMPTY,
Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(), null)
.get();
assertThat(((PerFieldSimilarityWrapper) query.getSimilarity()).get("custom_string"), instanceOf(expected.getClass()));
}

View File

@ -60,7 +60,7 @@ public class MapperTestUtils {
MapperRegistry mapperRegistry = indicesModule.getMapperRegistry();
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(indexName, finalSettings);
IndexAnalyzers indexAnalyzers = createTestAnalysis(indexSettings, finalSettings).indexAnalyzers;
SimilarityService similarityService = new SimilarityService(indexSettings, Collections.emptyMap());
SimilarityService similarityService = new SimilarityService(indexSettings, null, Collections.emptyMap());
return new MapperService(indexSettings,
indexAnalyzers,
xContentRegistry,

View File

@ -274,7 +274,7 @@ public abstract class IndexShardTestCase extends ESTestCase {
MapperService mapperService = MapperTestUtils.newMapperService(xContentRegistry(), createTempDir(),
indexSettings.getSettings(), "index");
mapperService.merge(indexMetaData, MapperService.MergeReason.MAPPING_RECOVERY, true);
SimilarityService similarityService = new SimilarityService(indexSettings, Collections.emptyMap());
SimilarityService similarityService = new SimilarityService(indexSettings, null, Collections.emptyMap());
final IndexEventListener indexEventListener = new IndexEventListener() {
};
final Engine.Warmer warmer = searcher -> {

View File

@ -21,6 +21,11 @@ package org.elasticsearch.script;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Scorer;
import org.elasticsearch.index.similarity.ScriptedSimilarity.Doc;
import org.elasticsearch.index.similarity.ScriptedSimilarity.Field;
import org.elasticsearch.index.similarity.ScriptedSimilarity.Query;
import org.elasticsearch.index.similarity.ScriptedSimilarity.Term;
import org.elasticsearch.index.similarity.SimilarityService;
import org.elasticsearch.search.lookup.LeafSearchLookup;
import org.elasticsearch.search.lookup.SearchLookup;
@ -94,6 +99,12 @@ public class MockScriptEngine implements ScriptEngine {
};
};
return context.factoryClazz.cast(factory);
} else if (context.instanceClazz.equals(SimilarityScript.class)) {
SimilarityScript.Factory factory = mockCompiled::createSimilarityScript;
return context.factoryClazz.cast(factory);
} else if (context.instanceClazz.equals(SimilarityWeightScript.class)) {
SimilarityWeightScript.Factory factory = mockCompiled::createSimilarityWeightScript;
return context.factoryClazz.cast(factory);
}
throw new IllegalArgumentException("mock script engine does not know how to handle context [" + context.name + "]");
}
@ -141,6 +152,14 @@ public class MockScriptEngine implements ScriptEngine {
}
return new MockSearchScript(lookup, context, script != null ? script : ctx -> source);
}
/**
 * Creates a mock similarity script backed by the configured function, or by a function
 * that always returns {@code 42} when none is configured.
 */
public SimilarityScript createSimilarityScript() {
    if (script == null) {
        return new MockSimilarityScript(ctx -> 42d);
    }
    return new MockSimilarityScript(script);
}
/**
 * Creates a mock similarity weight script backed by the configured function, or by a
 * function that always returns {@code 42} when none is configured.
 */
public SimilarityWeightScript createSimilarityWeightScript() {
    if (script == null) {
        return new MockSimilarityWeightScript(ctx -> 42d);
    }
    return new MockSimilarityWeightScript(script);
}
}
public class MockExecutableScript implements ExecutableScript {
@ -224,6 +243,44 @@ public class MockScriptEngine implements ScriptEngine {
}
}
/**
 * {@link SimilarityScript} whose result is produced by a plain Java function. The script
 * arguments are exposed to the function as a map keyed by variable name.
 */
public class MockSimilarityScript extends SimilarityScript {

    private final Function<Map<String, Object>, Object> script;

    MockSimilarityScript(Function<Map<String, Object>, Object> script) {
        this.script = script;
    }

    /**
     * Passes {@code weight}, {@code query}, {@code field}, {@code term} and {@code doc} to the
     * function and returns its result, which must be a {@link Number}, as a double.
     */
    @Override
    public double execute(double weight, Query query, Field field, Term term, Doc doc) throws IOException {
        final Map<String, Object> variables = new HashMap<>();
        variables.put("weight", weight);
        variables.put("query", query);
        variables.put("field", field);
        variables.put("term", term);
        variables.put("doc", doc);
        final Number result = (Number) script.apply(variables);
        return result.doubleValue();
    }
}
/**
 * {@link SimilarityWeightScript} whose result is produced by a plain Java function. The
 * script arguments are exposed to the function as a map keyed by variable name.
 */
public class MockSimilarityWeightScript extends SimilarityWeightScript {

    private final Function<Map<String, Object>, Object> script;

    MockSimilarityWeightScript(Function<Map<String, Object>, Object> script) {
        this.script = script;
    }

    /**
     * Passes {@code query}, {@code field} and {@code term} to the function and returns its
     * result, which must be a {@link Number}, as a double.
     */
    @Override
    public double execute(Query query, Field field, Term term) throws IOException {
        final Map<String, Object> variables = new HashMap<>();
        variables.put("query", query);
        variables.put("field", field);
        variables.put("term", term);
        final Number result = (Number) script.apply(variables);
        return result.doubleValue();
    }
}
public static Script mockInlineScript(final String script) {
return new Script(ScriptType.INLINE, "mock", script, emptyMap());
}

View File

@ -1036,7 +1036,7 @@ public abstract class AbstractQueryTestCase<QB extends AbstractQueryBuilder<QB>>
AnalysisModule analysisModule = new AnalysisModule(new Environment(nodeSettings), emptyList());
IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
scriptService = scriptModule.getScriptService();
similarityService = new SimilarityService(idxSettings, Collections.emptyMap());
similarityService = new SimilarityService(idxSettings, null, Collections.emptyMap());
MapperRegistry mapperRegistry = indicesModule.getMapperRegistry();
mapperService = new MapperService(idxSettings, indexAnalyzers, xContentRegistry, similarityService, mapperRegistry,
this::createShardContext);